# Read the training and test files as comma-separated tables.
# na.strings = "?" turns "?" fields into NA; strip.white = TRUE removes
# leading/trailing white space from unquoted fields before that comparison.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
train <- read.table(
  "../data/rawdata/adult.data.txt",
  sep = ",", na.strings = "?", strip.white = TRUE
)
test <- read.table(
  "../data/rawdata/adult.test.txt",
  sep = ",", na.strings = "?", strip.white = TRUE
)
# Row and column counts of each table (recorded output shown below each call).
dim(train)
## [1] 32561 15
dim(test)
## [1] 16281 15
# Give both data frames the same 15 column names. The replacement assignments
# are chained, so the name vector is written once and applied to `test` and
# then to `train`.
names(train) <- names(test) <- c(
  "age", "workclass", "fnlwgt", "education", "education-num",
  "marital-status", "occupation", "relationship", "race", "sex",
  "capital-gain", "capital-loss", "hours-per-week", "native-country",
  "income"
)
# Find missing values and NAs for training set.
# For each column, print the row names and the count of entries that are
# (1) NA, (2) equal to the empty string, and (3) equal to the string " ?".
# NOTE(review): the data is read with na.strings = "?" and strip.white = TRUE,
# so "?" fields presumably already arrive as NA and check (3) is not expected
# to match anything -- confirm before relying on it.
# seq_len() is used instead of 1:ncol() so a zero-column frame runs no
# iterations (1:0 would iterate over c(1, 0)).
for (i in seq_len(ncol(train))) {
  cat("<names of NA rows in", colnames(train)[i], "variable>", "\n")
  cat(rownames(train)[is.na(train[[i]])], "\n")
  cat("Number of NA values: ", sum(is.na(train[[i]])), "\n")
  print("======================================")
  print("======================================")
  cat("<names of rows contain missing values in", colnames(train)[i],
      "variable>", "\n")
  # which() keeps only TRUE positions, so NA comparisons (from NA entries)
  # are dropped. Row names are indexed directly instead of subsetting the
  # whole data frame first.
  cat(rownames(train)[which(train[[i]] == "")], "\n")
  cat("Number of Missing values : ", length(which(train[[i]] == "")), "\n")
  print("======================================")
  print("======================================")
  cat("<names of rows contain ? values in", colnames(train)[i],
      "variable>", "\n")
  cat(rownames(train)[which(train[[i]] == " ?")], "\n")
  cat("Number of ? values : ", length(which(train[[i]] == " ?")), "\n")
  print("======================================")
  print("======================================")
}
## <names of NA rows in age variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in age variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in age variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in workclass variable>
## 28 62 70 78 107 129 150 155 161 188 202 222 227 244 267 298 313 327 347 348 355 398 409 431 432 450 460 472 485 487 500 512 516 518 519 540 577 581 591 592 597 649 657 669 672 687 696 735 789 790 793 807 812 831 835 886 891 904 925 932 970 983 1020 1035 1036 1040 1047 1098 1101 1132 1135 1153 1168 1176 1181 1186 1209 1216 1218 1263 1283 1291 1313 1318 1326 1348 1350 1372 1389 1405 1421 1434 1442 1453 1459 1505 1545 1563 1570 1571 1575 1580 1593 1607 1630 1657 1666 1677 1705 1708 1759 1762 1774 1779 1824 1847 1852 1866 1879 1924 1932 1972 1988 2026 2037 2047 2062 2073 2085 2092 2095 2106 2119 2127 2153 2156 2164 2165 2211 2214 2223 2282 2293 2324 2328 2341 2355 2357 2359 2360 2373 2381 2383 2398 2421 2428 2465 2477 2487 2492 2497 2507 2514 2522 2545 2567 2571 2572 2579 2587 2595 2607 2633 2635 2639 2674 2677 2690 2752 2761 2762 2848 2857 2858 2859 2886 2931 2933 2948 2953 2962 3000 3006 3034 3043 3066 3073 3089 3097 3120 3132 3147 3208 3212 3229 3232 3240 3256 3270 3292 3298 3331 3339 3352 3372 3388 3403 3440 3454 3457 3460 3487 3517 3532 3556 3573 3574 3580 3589 3593 3595 3598 3632 3671 3704 3726 3737 3745 3748 3760 3774 3776 3806 3823 3835 3844 3852 3864 3888 3896 3898 3902 3903 3917 3943 3948 3950 3951 3964 3970 3981 3991 4004 4018 4019 4022 4073 4081 4087 4091 4110 4148 4153 4156 4170 4175 4201 4213 4216 4218 4241 4273 4289 4300 4310 4316 4338 4370 4394 4410 4414 4423 4436 4438 4461 4465 4500 4501 4520 4532 4549 4554 4593 4607 4608 4614 4622 4628 4656 4686 4689 4722 4730 4746 4753 4756 4767 4779 4783 4802 4813 4819 4836 4839 4885 4926 4943 4959 4971 4981 4983 5021 5065 5148 5157 5173 5194 5199 5208 5210 5215 5229 5256 5295 5297 5303 5308 5321 5322 5341 5345 5347 5384 5385 5441 5447 5472 5493 5527 5530 5549 5562 5566 5591 5624 5633 5653 5680 5688 5721 5754 5767 5789 5804 5809 5833 5854 5917 5922 5929 5978 5984 6016 6040 6052 6060 6132 6179 6232 6285 6286 6315 6343 6352 6408 6433 6449 6511 6522 6537 6543 6550 6559 6565 6592 6641 6647 6664 6680 6734 6735 6754 
6767 6799 6835 6861 6863 6878 6897 6915 6936 6949 6994 6996 7012 7028 7050 7076 7101 7103 7107 7137 7150 7165 7168 7174 7194 7292 7302 7323 7341 7353 7438 7458 7464 7473 7511 7555 7560 7561 7577 7580 7585 7612 7664 7684 7725 7741 7747 7751 7764 7774 7785 7788 7816 7827 7840 7863 7873 7877 7901 7906 7943 7964 7972 7978 8000 8003 8008 8023 8043 8054 8058 8070 8086 8089 8097 8099 8101 8135 8148 8169 8190 8194 8223 8242 8298 8323 8365 8388 8430 8447 8448 8473 8500 8533 8544 8566 8608 8637 8644 8674 8693 8695 8750 8758 8765 8770 8783 8789 8796 8806 8823 8848 8854 8909 8921 8941 8950 8955 8964 8986 8992 8997 9029 9031 9107 9139 9141 9142 9148 9149 9156 9171 9179 9198 9212 9215 9246 9294 9325 9341 9343 9352 9354 9359 9368 9410 9453 9478 9485 9491 9501 9532 9537 9550 9558 9578 9583 9617 9627 9651 9704 9709 9713 9779 9788 9861 9873 9886 9908 9927 9928 9939 9988 10014 10016 10017 10036 10057 10065 10095 10099 10103 10111 10118 10127 10140 10144 10162 10223 10233 10254 10329 10343 10362 10412 10426 10438 10441 10461 10476 10486 10487 10540 10547 10571 10582 10674 10680 10682 10684 10685 10701 10705 10710 10716 10719 10746 10747 10785 10806 10821 10829 10838 10857 10882 10932 10933 10956 10960 10996 11002 11028 11040 11047 11057 11060 11086 11088 11100 11159 11160 11165 11193 11199 11218 11229 11235 11262 11287 11295 11317 11329 11335 11341 11347 11356 11392 11414 11422 11431 11461 11475 11485 11501 11517 11527 11533 11545 11551 11574 11579 11581 11592 11615 11622 11659 11689 11692 11714 11732 11733 11735 11769 11771 11774 11794 11852 11865 11939 12008 12009 12021 12030 12069 12094 12099 12131 12154 12162 12176 12199 12214 12215 12219 12254 12300 12327 12335 12352 12374 12378 12406 12412 12429 12439 12452 12492 12493 12544 12554 12590 12610 12628 12652 12668 12785 12795 12850 12854 12859 12909 12919 12920 12937 12982 12992 12997 13026 13027 13043 13044 13066 13070 13075 13096 13111 13154 13162 13178 13182 13183 13204 13235 13291 13303 13322 13348 13366 13371 13385 13448 13494 
13499 13505 13516 13526 13535 13553 13557 13570 13587 13609 13627 13647 13712 13745 13785 13816 13844 13846 13885 13934 13950 14006 14054 14068 14109 14119 14124 14152 14179 14204 14214 14218 14255 14256 14281 14331 14345 14349 14361 14364 14371 14399 14419 14431 14442 14500 14535 14536 14537 14542 14549 14572 14575 14579 14603 14619 14647 14673 14689 14693 14718 14719 14726 14743 14747 14860 14861 14871 14888 14912 14940 14946 14981 14983 15013 15023 15034 15065 15070 15131 15177 15193 15221 15239 15257 15267 15287 15293 15310 15311 15351 15415 15425 15427 15465 15472 15477 15485 15500 15524 15533 15543 15548 15580 15581 15585 15597 15599 15617 15644 15675 15686 15697 15744 15774 15779 15783 15847 15861 15872 15912 15961 16000 16005 16020 16064 16066 16083 16104 16118 16124 16131 16137 16147 16152 16156 16174 16180 16186 16197 16213 16222 16293 16295 16347 16380 16383 16400 16405 16411 16455 16457 16489 16491 16516 16524 16536 16567 16584 16596 16603 16643 16660 16680 16726 16732 16744 16749 16751 16756 16761 16763 16795 16799 16803 16811 16818 16828 16836 16839 16879 16908 16967 16979 16985 17016 17031 17040 17097 17099 17133 17169 17210 17248 17280 17300 17315 17322 17327 17348 17392 17413 17415 17463 17471 17506 17532 17538 17588 17595 17636 17644 17645 17649 17709 17711 17718 17724 17726 17751 17753 17758 17763 17774 17791 17812 17838 17877 17883 17903 17906 18009 18011 18020 18037 18057 18095 18162 18165 18182 18195 18202 18218 18219 18233 18237 18245 18258 18260 18295 18323 18332 18338 18343 18357 18359 18363 18385 18387 18410 18467 18471 18497 18535 18542 18561 18565 18578 18600 18601 18605 18616 18623 18656 18721 18731 18751 18754 18795 18806 18847 18913 18924 18925 18932 18935 18943 18953 18965 18990 18993 19042 19059 19074 19091 19134 19135 19154 19169 19181 19231 19234 19241 19254 19256 19285 19312 19319 19321 19338 19346 19434 19439 19456 19462 19463 19493 19510 19545 19547 19549 19562 19617 19621 19658 19707 19709 19765 19776 19787 19789 19813 19815 
19820 19821 19831 19843 19858 19890 19897 19987 20004 20008 20010 20018 20024 20030 20032 20039 20065 20069 20073 20095 20100 20106 20160 20162 20188 20192 20206 20228 20267 20270 20272 20292 20303 20315 20322 20334 20397 20400 20435 20437 20475 20481 20528 20531 20545 20552 20564 20577 20596 20609 20613 20647 20657 20665 20687 20725 20758 20774 20776 20777 20783 20798 20804 20826 20827 20830 20869 20878 20881 20916 20931 20939 20942 20954 20964 21040 21097 21115 21126 21143 21147 21153 21159 21177 21180 21243 21244 21273 21275 21290 21349 21357 21395 21411 21414 21423 21429 21438 21454 21456 21466 21484 21488 21492 21517 21525 21529 21538 21546 21550 21587 21627 21632 21649 21667 21687 21699 21726 21747 21763 21800 21804 21806 21813 21830 21848 21852 21858 21862 21893 21915 21920 21947 22007 22023 22034 22043 22061 22076 22095 22102 22110 22131 22166 22174 22226 22255 22282 22349 22351 22374 22379 22380 22388 22397 22406 22423 22496 22503 22511 22546 22557 22634 22645 22732 22752 22758 22787 22796 22799 22821 22834 22842 22849 22886 22899 22911 22929 22952 22977 22984 22998 23010 23020 23063 23100 23110 23124 23137 23177 23193 23210 23229 23238 23268 23282 23316 23331 23334 23337 23343 23352 23354 23374 23376 23389 23396 23416 23422 23503 23536 23537 23539 23545 23552 23593 23637 23671 23712 23730 23745 23756 23760 23794 23819 23823 23826 23854 23866 23881 23916 23919 23921 23937 23955 23981 24004 24016 24048 24054 24055 24074 24085 24110 24134 24150 24156 24184 24196 24233 24239 24242 24248 24300 24309 24319 24380 24387 24406 24430 24462 24476 24486 24528 24533 24580 24581 24637 24645 24687 24695 24705 24715 24736 24753 24761 24774 24780 24782 24788 24797 24810 24814 24822 24829 24876 24877 24895 24903 24914 24950 24999 25010 25035 25045 25054 25057 25061 25068 25077 25100 25123 25135 25164 25187 25209 25217 25226 25239 25267 25271 25296 25304 25322 25360 25398 25418 25425 25431 25442 25458 25520 25523 25525 25538 25569 25579 25589 25637 25670 25686 25704 25727 
25749 25756 25773 25803 25807 25814 25820 25827 25836 25847 25854 25878 25903 25929 25930 25931 25977 25981 26013 26041 26052 26082 26094 26120 26128 26138 26144 26146 26164 26226 26245 26260 26288 26296 26297 26313 26332 26349 26364 26374 26376 26411 26417 26422 26451 26490 26504 26513 26514 26523 26550 26567 26587 26599 26600 26682 26688 26689 26777 26792 26839 26867 26925 26929 26959 26983 26987 26993 26999 27010 27019 27023 27039 27052 27086 27087 27101 27108 27141 27145 27180 27202 27226 27266 27268 27318 27322 27332 27351 27383 27396 27427 27453 27467 27476 27499 27508 27514 27519 27523 27550 27564 27570 27579 27580 27591 27596 27614 27643 27658 27666 27677 27699 27703 27708 27724 27731 27771 27775 27809 27845 27854 27899 27916 27936 27945 27972 27994 28010 28021 28038 28044 28113 28139 28144 28145 28146 28149 28165 28176 28195 28207 28211 28242 28272 28277 28294 28304 28321 28345 28366 28383 28385 28395 28426 28477 28479 28504 28544 28549 28587 28604 28630 28687 28689 28716 28774 28784 28803 28812 28850 28856 28860 28886 28892 28905 28919 28923 28934 28944 28954 28958 28962 29027 29041 29062 29063 29073 29095 29098 29102 29120 29129 29178 29207 29223 29260 29298 29311 29319 29325 29326 29341 29360 29361 29376 29392 29398 29422 29437 29448 29456 29466 29481 29528 29583 29587 29602 29615 29663 29694 29699 29711 29725 29738 29753 29792 29799 29808 29815 29820 29836 29843 29882 29903 29957 29968 30043 30060 30062 30069 30103 30157 30159 30164 30207 30209 30210 30219 30269 30278 30296 30314 30335 30370 30385 30398 30403 30413 30445 30451 30457 30469 30514 30559 30570 30585 30616 30624 30625 30629 30642 30661 30674 30678 30688 30694 30699 30708 30712 30714 30724 30728 30745 30758 30777 30782 30785 30823 30828 30831 30898 30931 30948 30978 31019 31033 31037 31044 31062 31063 31074 31095 31102 31109 31119 31124 31179 31194 31202 31221 31236 31248 31254 31274 31277 31280 31308 31314 31361 31373 31391 31422 31433 31528 31537 31541 31569 31578 31592 31595 31606 31622 
31634 31636 31662 31665 31669 31697 31699 31711 31712 31724 31732 31740 31752 31754 31759 31766 31773 31776 31791 31793 31794 31811 31822 31837 31863 31872 31886 31909 31913 31914 31998 31999 32009 32017 32040 32063 32071 32074 32076 32081 32084 32089 32094 32104 32190 32202 32207 32276 32292 32311 32318 32336 32344 32427 32478 32491 32495 32526 32531 32532 32540 32542 32543
## Number of NA values: 1836
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in workclass variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in workclass variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in fnlwgt variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in fnlwgt variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in fnlwgt variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education-num variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education-num variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education-num variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in marital-status variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in marital-status variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in marital-status variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in occupation variable>
## 28 62 70 78 107 129 150 155 161 188 202 222 227 244 267 298 313 327 347 348 355 398 409 431 432 450 460 472 485 487 500 512 516 518 519 540 577 581 591 592 597 649 657 669 672 687 696 735 789 790 793 807 812 831 835 886 891 904 925 932 970 983 1020 1035 1036 1040 1047 1098 1101 1132 1135 1153 1168 1176 1181 1186 1209 1216 1218 1263 1283 1291 1313 1318 1326 1348 1350 1372 1389 1405 1421 1434 1442 1453 1459 1505 1545 1563 1570 1571 1575 1580 1593 1607 1630 1657 1666 1677 1705 1708 1759 1762 1774 1779 1824 1847 1852 1866 1879 1924 1932 1972 1988 2026 2037 2047 2062 2073 2085 2092 2095 2106 2119 2127 2153 2156 2164 2165 2211 2214 2223 2282 2293 2324 2328 2341 2355 2357 2359 2360 2373 2381 2383 2398 2421 2428 2465 2477 2487 2492 2497 2507 2514 2522 2545 2567 2571 2572 2579 2587 2595 2607 2633 2635 2639 2674 2677 2690 2752 2761 2762 2848 2857 2858 2859 2886 2931 2933 2948 2953 2962 3000 3006 3034 3043 3066 3073 3089 3097 3120 3132 3147 3208 3212 3229 3232 3240 3256 3270 3292 3298 3331 3339 3352 3372 3388 3403 3440 3454 3457 3460 3487 3517 3532 3556 3573 3574 3580 3589 3593 3595 3598 3632 3671 3704 3726 3737 3745 3748 3760 3774 3776 3806 3823 3835 3844 3852 3864 3888 3896 3898 3902 3903 3917 3943 3948 3950 3951 3964 3970 3981 3991 4004 4018 4019 4022 4073 4081 4087 4091 4110 4148 4153 4156 4170 4175 4201 4213 4216 4218 4241 4273 4289 4300 4310 4316 4338 4370 4394 4410 4414 4423 4436 4438 4461 4465 4500 4501 4520 4532 4549 4554 4593 4607 4608 4614 4622 4628 4656 4686 4689 4722 4730 4746 4753 4756 4767 4779 4783 4802 4813 4819 4836 4839 4885 4926 4943 4959 4971 4981 4983 5021 5065 5148 5157 5173 5194 5199 5208 5210 5215 5229 5256 5295 5297 5303 5308 5321 5322 5341 5345 5347 5362 5384 5385 5441 5447 5472 5493 5527 5530 5549 5562 5566 5591 5624 5633 5653 5680 5688 5721 5754 5767 5789 5804 5809 5833 5854 5917 5922 5929 5978 5984 6016 6040 6052 6060 6132 6179 6232 6285 6286 6315 6343 6352 6408 6433 6449 6511 6522 6537 6543 6550 6559 6565 6592 6641 6647 6664 6680 6734 6735 
6754 6767 6799 6835 6861 6863 6878 6897 6915 6936 6949 6994 6996 7012 7028 7050 7076 7101 7103 7107 7137 7150 7165 7168 7174 7194 7292 7302 7323 7341 7353 7438 7458 7464 7473 7511 7555 7560 7561 7577 7580 7585 7612 7664 7684 7725 7741 7747 7751 7764 7774 7785 7788 7816 7827 7840 7863 7873 7877 7901 7906 7943 7964 7972 7978 8000 8003 8008 8023 8043 8054 8058 8070 8086 8089 8097 8099 8101 8135 8148 8169 8190 8194 8223 8242 8298 8323 8365 8388 8430 8447 8448 8473 8500 8533 8544 8566 8608 8637 8644 8674 8693 8695 8750 8758 8765 8770 8783 8789 8796 8806 8823 8848 8854 8909 8921 8941 8950 8955 8964 8986 8992 8997 9029 9031 9107 9139 9141 9142 9148 9149 9156 9171 9179 9198 9212 9215 9246 9294 9325 9341 9343 9352 9354 9359 9368 9410 9453 9478 9485 9491 9501 9532 9537 9550 9558 9578 9583 9617 9627 9651 9704 9709 9713 9779 9788 9861 9873 9886 9908 9927 9928 9939 9988 10014 10016 10017 10036 10057 10065 10095 10099 10103 10111 10118 10127 10140 10144 10162 10223 10233 10254 10329 10343 10362 10412 10426 10438 10441 10461 10476 10486 10487 10540 10547 10571 10582 10674 10680 10682 10684 10685 10701 10705 10710 10716 10719 10746 10747 10785 10806 10821 10829 10838 10846 10857 10882 10932 10933 10956 10960 10996 11002 11028 11040 11047 11057 11060 11086 11088 11100 11159 11160 11165 11193 11199 11218 11229 11235 11262 11287 11295 11317 11329 11335 11341 11347 11356 11392 11414 11422 11431 11461 11475 11485 11501 11517 11527 11533 11545 11551 11574 11579 11581 11592 11615 11622 11659 11689 11692 11714 11732 11733 11735 11769 11771 11774 11794 11852 11865 11939 12008 12009 12021 12030 12069 12094 12099 12131 12154 12162 12176 12199 12214 12215 12219 12254 12300 12327 12335 12352 12374 12378 12406 12412 12429 12439 12452 12492 12493 12544 12554 12590 12610 12628 12652 12668 12785 12795 12850 12854 12859 12909 12919 12920 12937 12982 12992 12997 13026 13027 13043 13044 13066 13070 13075 13096 13111 13154 13162 13178 13182 13183 13204 13235 13291 13303 13322 13348 13366 13371 13385 
13448 13494 13499 13505 13516 13526 13535 13553 13557 13570 13587 13609 13627 13647 13712 13745 13785 13816 13844 13846 13885 13934 13950 14006 14054 14068 14109 14119 14124 14152 14179 14204 14214 14218 14255 14256 14281 14331 14345 14349 14361 14364 14371 14399 14419 14431 14442 14500 14535 14536 14537 14542 14549 14572 14575 14579 14603 14619 14647 14673 14689 14693 14718 14719 14726 14743 14747 14773 14860 14861 14871 14888 14912 14940 14946 14981 14983 15013 15023 15034 15065 15070 15131 15177 15193 15221 15239 15257 15267 15287 15293 15310 15311 15351 15415 15425 15427 15465 15472 15477 15485 15500 15524 15533 15543 15548 15580 15581 15585 15597 15599 15617 15644 15675 15686 15697 15744 15774 15779 15783 15847 15861 15872 15912 15961 16000 16005 16020 16064 16066 16083 16104 16118 16124 16131 16137 16147 16152 16156 16174 16180 16186 16197 16213 16222 16293 16295 16347 16380 16383 16400 16405 16411 16455 16457 16489 16491 16516 16524 16536 16567 16584 16596 16603 16643 16660 16680 16726 16732 16744 16749 16751 16756 16761 16763 16795 16799 16803 16811 16818 16828 16836 16839 16879 16908 16967 16979 16985 17016 17031 17040 17097 17099 17133 17169 17210 17248 17280 17300 17315 17322 17327 17348 17392 17413 17415 17463 17471 17506 17532 17538 17588 17595 17636 17644 17645 17649 17709 17711 17718 17724 17726 17751 17753 17758 17763 17774 17791 17812 17838 17877 17883 17903 17906 18009 18011 18020 18037 18057 18095 18162 18165 18182 18195 18202 18218 18219 18233 18237 18245 18258 18260 18295 18323 18332 18338 18343 18357 18359 18363 18385 18387 18410 18467 18471 18497 18535 18542 18561 18565 18578 18600 18601 18605 18616 18623 18656 18721 18731 18751 18754 18795 18806 18847 18913 18924 18925 18932 18935 18943 18953 18965 18990 18993 19042 19059 19074 19091 19134 19135 19154 19169 19181 19231 19234 19241 19254 19256 19285 19312 19319 19321 19338 19346 19434 19439 19456 19462 19463 19493 19510 19545 19547 19549 19562 19617 19621 19658 19707 19709 19765 19776 19787 
19789 19813 19815 19820 19821 19831 19843 19858 19890 19897 19987 20004 20008 20010 20018 20024 20030 20032 20039 20065 20069 20073 20095 20100 20106 20160 20162 20188 20192 20206 20228 20267 20270 20272 20292 20303 20315 20322 20334 20338 20397 20400 20435 20437 20475 20481 20528 20531 20545 20552 20564 20577 20596 20609 20613 20647 20657 20665 20687 20725 20758 20774 20776 20777 20783 20798 20804 20826 20827 20830 20869 20878 20881 20916 20931 20939 20942 20954 20964 21040 21097 21115 21126 21143 21147 21153 21159 21177 21180 21243 21244 21273 21275 21290 21349 21357 21395 21411 21414 21423 21429 21438 21454 21456 21466 21484 21488 21492 21517 21525 21529 21538 21546 21550 21587 21627 21632 21649 21667 21687 21699 21726 21747 21763 21800 21804 21806 21813 21830 21848 21852 21858 21862 21893 21915 21920 21947 22007 22023 22034 22043 22061 22076 22095 22102 22110 22131 22166 22174 22226 22255 22282 22349 22351 22374 22379 22380 22388 22397 22406 22423 22496 22503 22511 22546 22557 22634 22645 22732 22752 22758 22787 22796 22799 22821 22834 22842 22849 22886 22899 22911 22929 22952 22977 22984 22998 23010 23020 23063 23100 23110 23124 23137 23177 23193 23210 23229 23233 23238 23268 23282 23316 23331 23334 23337 23343 23352 23354 23374 23376 23389 23396 23416 23422 23503 23536 23537 23539 23545 23552 23593 23637 23671 23712 23730 23745 23756 23760 23794 23819 23823 23826 23854 23866 23881 23916 23919 23921 23937 23955 23981 24004 24016 24048 24054 24055 24074 24085 24110 24134 24150 24156 24184 24196 24233 24239 24242 24248 24300 24309 24319 24380 24387 24406 24430 24462 24476 24486 24528 24533 24580 24581 24637 24645 24687 24695 24705 24715 24736 24753 24761 24774 24780 24782 24788 24797 24810 24814 24822 24829 24876 24877 24895 24903 24914 24950 24999 25010 25035 25045 25054 25057 25061 25068 25077 25100 25123 25135 25164 25187 25209 25217 25226 25239 25267 25271 25296 25304 25322 25360 25398 25418 25425 25431 25442 25458 25520 25523 25525 25538 25569 25579 25589 
25637 25670 25686 25704 25727 25749 25756 25773 25803 25807 25814 25820 25827 25836 25847 25854 25878 25903 25929 25930 25931 25977 25981 26013 26041 26052 26082 26094 26120 26128 26138 26144 26146 26164 26226 26245 26260 26288 26296 26297 26313 26332 26349 26364 26374 26376 26411 26417 26422 26451 26490 26504 26513 26514 26523 26550 26567 26587 26599 26600 26682 26688 26689 26777 26792 26839 26867 26925 26929 26959 26983 26987 26993 26999 27010 27019 27023 27039 27052 27086 27087 27101 27108 27141 27145 27180 27202 27226 27266 27268 27318 27322 27332 27351 27383 27396 27427 27453 27467 27476 27499 27508 27514 27519 27523 27550 27564 27570 27579 27580 27591 27596 27614 27643 27658 27666 27677 27699 27703 27708 27724 27731 27771 27775 27809 27845 27854 27899 27916 27936 27945 27972 27994 28010 28021 28038 28044 28113 28139 28144 28145 28146 28149 28165 28176 28195 28207 28211 28242 28272 28277 28294 28304 28321 28345 28366 28383 28385 28395 28426 28477 28479 28504 28544 28549 28587 28604 28630 28687 28689 28716 28774 28784 28803 28812 28850 28856 28860 28886 28892 28905 28919 28923 28934 28944 28954 28958 28962 29027 29041 29062 29063 29073 29095 29098 29102 29120 29129 29178 29207 29223 29260 29298 29311 29319 29325 29326 29341 29360 29361 29376 29392 29398 29422 29437 29448 29456 29466 29481 29528 29583 29587 29602 29615 29663 29694 29699 29711 29725 29738 29753 29792 29799 29808 29815 29820 29836 29843 29882 29903 29957 29968 30043 30060 30062 30069 30103 30157 30159 30164 30207 30209 30210 30219 30269 30278 30296 30314 30335 30370 30385 30398 30403 30413 30445 30451 30457 30469 30514 30559 30570 30585 30616 30624 30625 30629 30642 30661 30674 30678 30688 30694 30699 30708 30712 30714 30724 30728 30745 30758 30777 30782 30785 30823 30828 30831 30898 30931 30948 30978 31019 31033 31037 31044 31062 31063 31074 31095 31102 31109 31119 31124 31179 31194 31202 31221 31236 31248 31254 31274 31277 31280 31308 31314 31361 31373 31391 31422 31433 31528 31537 31541 31569 
31578 31592 31595 31606 31622 31634 31636 31662 31665 31669 31697 31699 31711 31712 31724 31732 31740 31752 31754 31759 31766 31773 31776 31791 31793 31794 31811 31822 31837 31863 31872 31886 31909 31913 31914 31998 31999 32009 32017 32040 32063 32071 32074 32076 32081 32084 32089 32094 32104 32190 32202 32207 32276 32292 32305 32311 32315 32318 32336 32344 32427 32478 32491 32495 32526 32531 32532 32540 32542 32543
## Number of NA values: 1843
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in occupation variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in occupation variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in relationship variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in relationship variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in relationship variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in race variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in race variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in race variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in sex variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in sex variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in sex variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-gain variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-gain variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-gain variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-loss variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-loss variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-loss variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in hours-per-week variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in hours-per-week variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in hours-per-week variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in native-country variable>
## 15 39 52 62 94 246 250 298 394 454 558 713 726 730 778 781 888 956 1027 1037 1116 1153 1159 1200 1225 1253 1327 1349 1392 1555 1558 1582 1594 1677 1712 1739 1819 1901 1991 2016 2100 2105 2182 2372 2513 2514 2519 2550 2573 2588 2592 2640 2718 2736 2776 2795 2910 2927 3024 3108 3132 3165 3167 3188 3201 3233 3248 3257 3462 3485 3496 3533 3580 3637 3835 3857 3859 4007 4157 4173 4198 4245 4302 4327 4397 4406 4463 4511 4579 4600 4640 4657 4659 4672 4773 4787 4828 5082 5181 5186 5202 5235 5310 5348 5375 5402 5451 5541 5648 5664 5684 5710 5824 5842 5855 5964 6006 6060 6130 6177 6187 6243 6320 6361 6365 6377 6396 6534 6677 6738 6845 7046 7073 7081 7097 7154 7167 7177 7254 7285 7328 7346 7399 7476 7616 7635 7689 7851 7862 7863 7903 7965 7991 8146 8161 8208 8226 8283 8357 8366 8478 8872 8904 8916 9016 9041 9238 9367 9419 9504 9538 9560 9581 9617 9625 9740 9786 9800 9850 9867 9986 10012 10063 10183 10185 10219 10289 10344 10354 10404 10409 10575 10635 10648 10675 10763 10778 10783 11148 11188 11222 11285 11301 11424 11447 11478 11596 11615 11653 11660 11984 11989 12005 12083 12115 12173 12261 12281 12316 12330 12363 12471 12561 12644 12656 12691 12696 12717 12749 12831 12900 12960 12974 12997 13089 13199 13202 13282 13306 13500 13604 13692 13748 13769 13818 13821 13827 13828 13898 13914 13919 13972 14044 14086 14103 14196 14235 14247 14341 14369 14411 14460 14563 14578 14583 14585 14593 14858 15024 15037 15137 15153 15162 15198 15220 15445 15476 15529 15595 15610 15614 15673 15679 15693 15735 15793 15864 15932 15933 15954 15989 16037 16080 16109 16142 16143 16232 16261 16267 16329 16382 16418 16440 16489 16501 16636 16648 16839 16863 16976 17022 17108 17194 17202 17275 17379 17453 17482 17483 17624 17648 17895 18066 18234 18278 18366 18413 18439 18460 18556 18586 18616 18673 18678 18907 18910 18983 19038 19047 19056 19170 19246 19257 19300 19317 19327 19347 19352 19415 19491 19533 19627 19677 19710 19728 19769 19785 19788 19947 19998 20204 20285 20334 20359 20465 20481 
20500 20532 20633 20639 20658 20659 20717 20748 20848 21063 21109 21127 21135 21196 21227 21265 21383 21394 21532 21542 21557 21669 21723 21819 22003 22069 22107 22231 22242 22265 22318 22352 22430 22475 22541 22562 22615 22640 22678 22743 22772 22789 22791 22862 22908 22982 23033 23116 23174 23237 23285 23435 23441 23467 23471 23566 23638 23688 23705 23730 23785 23798 23893 23916 24214 24458 24466 24573 24593 24607 24663 24696 24751 24833 24891 24892 24924 24961 24981 25047 25106 25113 25236 25242 25276 25297 25314 25343 25360 25459 25479 25492 25505 25550 25575 25620 25630 25842 25871 26008 26198 26222 26235 26272 26297 26333 26364 26378 26406 26447 26461 26570 26617 26662 26763 26801 26901 26923 26941 26980 27020 27069 27134 27142 27300 27306 27377 27384 27670 28019 28045 28108 28125 28195 28196 28197 28221 28336 28344 28432 28483 28501 28506 28590 28619 28629 28689 28706 28836 28842 28913 28933 28938 29030 29034 29099 29105 29213 29256 29324 29358 29378 29402 29441 29524 29593 29681 29683 29739 29778 29787 29889 29982 30011 30106 30111 30171 30231 30275 30277 30303 30330 30370 30583 30639 30657 30671 30701 30774 30822 30903 30923 31090 31129 31337 31360 31388 31397 31469 31556 31638 31642 31702 31797 31945 32091 32170 32214 32233 32255 32308 32414 32450 32470 32493 32511 32526
## Number of NA values: 583
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in native-country variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in native-country variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in income variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in income variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in income variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
# Find missing values and NAs for the testing set.
# For every column, print the row names (and the count) of entries that are
# NA, the empty string "", or the literal " ?".
# seq_len() keeps the loop from running when the data frame has no columns
# (1:ncol(test) would iterate over c(1, 0) in that case).
for (i in seq_len(ncol(test))) {
  col_name <- colnames(test)[i]
  col_values <- test[, i]

  # Compute each set of row names once and reuse it for listing and counting.
  na_rows <- rownames(test)[is.na(col_values)]
  # which() drops the NA comparisons produced by missing entries.
  empty_rows <- rownames(test)[which(col_values == "")]
  # NOTE(review): the file is read with strip.white and na.strings = "?"
  # (see the read.table() calls at the top), so " ?" presumably can never
  # match here; the literal is kept unchanged to preserve the report.
  qmark_rows <- rownames(test)[which(col_values == " ?")]

  cat("<names of NA rows in", col_name, "variable>", "\n")
  cat(na_rows, "\n")
  cat("Number of NA values: ", length(na_rows), "\n")
  print("======================================")
  print("======================================")
  cat("<names of rows contain missing values in", col_name, "variable>", "\n")
  cat(empty_rows, "\n")
  cat("Number of Missing values : ", length(empty_rows), "\n")
  print("======================================")
  print("======================================")
  cat("<names of rows contain ? values in", col_name, "variable>", "\n")
  cat(qmark_rows, "\n")
  cat("Number of ? values : ", length(qmark_rows), "\n")
  print("======================================")
  print("======================================")
}
## <names of NA rows in age variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in age variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in age variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in workclass variable>
## 5 7 14 23 36 76 90 101 114 133 183 186 194 229 230 246 267 269 275 317 332 351 379 395 398 414 430 435 438 471 506 517 564 605 613 627 638 641 642 648 658 665 694 704 718 729 766 769 782 817 874 881 914 916 927 934 961 982 1001 1003 1006 1010 1019 1030 1044 1049 1064 1121 1128 1131 1143 1157 1168 1170 1178 1198 1206 1242 1252 1259 1260 1286 1307 1339 1363 1368 1378 1396 1418 1428 1439 1466 1481 1523 1525 1536 1561 1594 1596 1607 1608 1613 1626 1627 1642 1666 1682 1701 1734 1747 1751 1775 1779 1781 1788 1792 1802 1814 1829 1833 1835 1838 1864 1867 1894 1940 1945 1956 1983 2024 2043 2056 2089 2093 2118 2123 2161 2164 2202 2229 2256 2282 2324 2334 2365 2411 2416 2417 2439 2448 2493 2495 2499 2508 2511 2532 2537 2540 2548 2557 2580 2585 2594 2613 2635 2643 2651 2652 2656 2667 2722 2761 2775 2776 2779 2798 2805 2809 2849 2903 2921 2956 2966 2996 3005 3019 3025 3038 3068 3075 3084 3091 3103 3131 3144 3201 3207 3217 3220 3229 3233 3238 3260 3289 3298 3314 3408 3414 3422 3432 3480 3493 3541 3570 3639 3670 3672 3691 3699 3726 3745 3747 3758 3817 3854 3860 3868 3871 3883 3921 3964 3978 3983 3995 3998 4004 4006 4018 4044 4056 4059 4068 4082 4109 4118 4137 4149 4156 4177 4217 4222 4228 4231 4234 4239 4242 4260 4266 4271 4275 4282 4298 4305 4322 4331 4351 4373 4441 4456 4468 4484 4487 4520 4576 4586 4598 4611 4625 4641 4642 4647 4662 4663 4669 4709 4729 4745 4746 4749 4754 4760 4777 4778 4784 4789 4803 4822 4824 4841 4844 4847 4859 4862 4871 4886 4899 4928 4935 4936 4947 4960 4985 4990 4996 4999 5024 5047 5053 5067 5079 5088 5109 5110 5135 5146 5147 5165 5175 5182 5228 5246 5257 5290 5330 5349 5361 5373 5374 5380 5392 5401 5426 5435 5471 5474 5477 5487 5516 5519 5531 5533 5537 5574 5584 5588 5624 5636 5647 5650 5682 5683 5702 5715 5727 5733 5736 5747 5784 5817 5825 5839 5886 5889 5897 5901 5937 5953 5958 5973 5991 5992 6051 6065 6083 6096 6100 6131 6153 6187 6220 6222 6232 6233 6251 6266 6321 6326 6335 6366 6385 6386 6399 6433 6434 6492 6500 6509 6526 6612 6614 6615 6624 
6643 6649 6661 6674 6685 6752 6767 6784 6791 6801 6806 6840 6842 6870 6875 6885 6918 6929 6942 6993 7067 7072 7139 7142 7153 7170 7171 7224 7232 7252 7264 7267 7271 7304 7375 7426 7436 7445 7457 7481 7512 7530 7535 7540 7559 7562 7567 7611 7621 7624 7634 7641 7648 7724 7725 7735 7772 7776 7786 7794 7800 7812 7829 7872 7883 7893 7896 7901 7924 7925 7927 7948 7971 8019 8024 8028 8042 8080 8104 8105 8112 8116 8117 8123 8162 8184 8190 8204 8207 8224 8232 8235 8247 8249 8251 8259 8285 8290 8291 8338 8342 8347 8384 8585 8612 8627 8636 8649 8661 8668 8682 8695 8703 8711 8714 8722 8754 8761 8764 8795 8804 8837 8838 8840 8852 8870 8876 8901 8917 8922 8954 8989 9031 9033 9055 9082 9084 9090 9092 9103 9129 9136 9138 9146 9176 9212 9252 9258 9275 9290 9313 9320 9355 9362 9368 9381 9382 9383 9418 9438 9494 9495 9502 9517 9525 9564 9568 9585 9586 9603 9608 9609 9624 9637 9649 9668 9709 9716 9724 9736 9751 9753 9768 9803 9808 9832 9842 9850 9876 9899 9905 10002 10025 10029 10057 10066 10077 10101 10111 10117 10133 10178 10180 10206 10215 10242 10249 10262 10266 10267 10271 10273 10286 10319 10365 10409 10432 10437 10509 10540 10560 10571 10579 10613 10621 10667 10671 10674 10700 10727 10757 10768 10796 10802 10834 10851 10872 10884 10891 10892 10915 10942 10979 11004 11055 11110 11133 11202 11225 11231 11254 11286 11304 11339 11356 11367 11404 11438 11458 11468 11494 11518 11562 11563 11596 11624 11647 11656 11702 11710 11763 11768 11789 11803 11849 11872 11882 11902 11906 11908 11922 11940 11947 11948 11967 11969 11974 11977 11988 12022 12027 12035 12036 12038 12053 12064 12066 12124 12163 12190 12195 12220 12238 12241 12250 12289 12308 12314 12323 12358 12362 12369 12372 12374 12399 12409 12422 12423 12425 12430 12462 12511 12562 12569 12577 12604 12612 12617 12677 12699 12708 12752 12774 12789 12802 12840 12857 12860 12868 12873 12876 12884 12893 12979 12994 13072 13074 13085 13095 13099 13124 13136 13139 13159 13184 13196 13206 13241 13290 13315 13323 13327 13331 13342 13380 
13406 13407 13415 13417 13418 13433 13440 13468 13473 13474 13491 13496 13521 13531 13578 13597 13660 13662 13664 13680 13769 13772 13780 13797 13826 13834 13838 13839 13854 13871 13873 13892 13898 13908 13936 13952 13958 13985 13988 13990 14019 14034 14048 14057 14074 14132 14135 14179 14203 14209 14240 14286 14316 14363 14373 14378 14386 14425 14431 14449 14459 14465 14490 14491 14505 14523 14548 14556 14558 14562 14574 14605 14606 14629 14653 14657 14665 14666 14670 14675 14681 14706 14722 14724 14727 14729 14738 14758 14759 14762 14780 14792 14816 14866 14884 14932 14942 14961 14981 15004 15008 15050 15075 15141 15171 15181 15182 15193 15201 15208 15221 15238 15249 15259 15260 15286 15287 15318 15325 15335 15337 15343 15367 15409 15414 15419 15428 15471 15480 15503 15521 15525 15530 15551 15558 15574 15593 15600 15638 15639 15655 15679 15680 15684 15711 15713 15733 15748 15749 15769 15782 15788 15797 15813 15823 15824 15833 15837 15840 15847 15852 15857 15862 15865 15877 15880 15910 15913 15917 15923 15925 15953 15984 15989 15993 16002 16007 16019 16033 16036 16118 16122 16209 16240 16252 16278
## Number of NA values: 963
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in workclass variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in workclass variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in fnlwgt variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in fnlwgt variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in fnlwgt variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education-num variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education-num variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education-num variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in marital-status variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in marital-status variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in marital-status variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in occupation variable>
## 5 7 14 23 36 76 90 101 114 133 183 186 194 229 230 246 267 269 275 317 332 351 379 395 398 414 430 435 438 471 506 517 564 605 613 627 638 641 642 648 658 665 694 704 718 729 766 769 782 817 874 881 914 916 927 934 961 982 1001 1003 1006 1010 1019 1030 1044 1049 1064 1121 1128 1131 1143 1157 1168 1170 1178 1198 1206 1242 1252 1259 1260 1286 1307 1339 1363 1368 1378 1396 1418 1428 1439 1466 1481 1523 1525 1536 1561 1594 1596 1607 1608 1613 1626 1627 1642 1666 1682 1701 1734 1747 1751 1775 1779 1781 1788 1792 1802 1814 1829 1833 1835 1838 1864 1867 1894 1940 1945 1956 1983 2024 2043 2056 2089 2093 2118 2123 2161 2164 2202 2229 2256 2282 2324 2334 2365 2411 2416 2417 2439 2448 2493 2495 2499 2508 2511 2532 2537 2540 2548 2557 2580 2585 2594 2613 2635 2643 2651 2652 2656 2667 2722 2761 2775 2776 2779 2798 2805 2809 2849 2903 2921 2956 2966 2996 3005 3019 3025 3038 3068 3075 3084 3091 3103 3131 3144 3201 3207 3217 3220 3229 3233 3238 3260 3289 3298 3314 3408 3414 3422 3432 3480 3493 3541 3570 3639 3670 3672 3691 3699 3726 3745 3747 3758 3817 3854 3860 3868 3871 3883 3921 3964 3978 3983 3995 3998 4004 4006 4018 4044 4056 4059 4068 4082 4109 4118 4137 4149 4156 4177 4217 4222 4228 4231 4234 4239 4242 4260 4266 4271 4275 4282 4298 4305 4322 4331 4351 4373 4441 4456 4468 4484 4487 4520 4576 4586 4598 4611 4625 4641 4642 4647 4662 4663 4669 4709 4729 4745 4746 4749 4754 4760 4777 4778 4784 4789 4803 4822 4824 4841 4844 4847 4859 4862 4871 4886 4899 4928 4935 4936 4947 4960 4985 4990 4996 4999 5024 5047 5053 5067 5079 5088 5109 5110 5135 5146 5147 5165 5175 5182 5228 5246 5257 5290 5330 5349 5361 5373 5374 5380 5392 5401 5426 5435 5471 5474 5477 5487 5516 5519 5531 5533 5537 5574 5584 5588 5624 5636 5647 5650 5682 5683 5702 5715 5727 5733 5736 5747 5784 5817 5825 5839 5886 5889 5897 5901 5937 5953 5958 5973 5991 5992 6051 6065 6083 6096 6100 6131 6153 6187 6220 6222 6232 6233 6251 6266 6321 6326 6335 6366 6385 6386 6399 6433 6434 6492 6500 6509 6526 6612 6614 6615 6624 
6643 6649 6661 6674 6685 6752 6767 6784 6791 6801 6806 6840 6842 6870 6875 6885 6918 6929 6942 6993 7067 7072 7139 7142 7153 7170 7171 7224 7232 7252 7264 7267 7271 7304 7375 7426 7436 7445 7457 7481 7512 7530 7535 7540 7559 7562 7567 7611 7621 7624 7634 7641 7648 7724 7725 7735 7772 7776 7786 7794 7800 7812 7829 7872 7883 7893 7896 7901 7924 7925 7927 7948 7971 8019 8024 8028 8042 8080 8104 8105 8112 8116 8117 8123 8162 8184 8190 8204 8207 8224 8232 8235 8247 8249 8251 8259 8285 8290 8291 8338 8342 8347 8384 8585 8612 8627 8636 8649 8661 8668 8682 8695 8703 8711 8714 8722 8754 8761 8764 8786 8795 8804 8837 8838 8840 8852 8870 8876 8901 8917 8922 8954 8989 9031 9033 9055 9082 9084 9090 9092 9103 9129 9136 9138 9146 9176 9212 9252 9258 9275 9290 9313 9320 9355 9362 9368 9381 9382 9383 9418 9438 9494 9495 9502 9517 9525 9564 9568 9585 9586 9603 9608 9609 9624 9637 9649 9668 9709 9716 9724 9736 9751 9753 9768 9803 9808 9832 9842 9850 9876 9899 9905 10002 10025 10029 10057 10066 10077 10101 10111 10117 10133 10178 10180 10206 10215 10242 10249 10262 10266 10267 10271 10273 10286 10319 10365 10409 10432 10437 10509 10540 10560 10571 10579 10613 10621 10667 10671 10674 10700 10727 10757 10768 10796 10802 10834 10851 10872 10884 10891 10892 10915 10942 10979 11004 11055 11110 11133 11202 11225 11231 11254 11286 11304 11339 11356 11367 11404 11438 11458 11468 11494 11518 11562 11563 11596 11608 11624 11647 11656 11702 11710 11763 11768 11789 11803 11849 11872 11882 11902 11906 11908 11922 11940 11947 11948 11967 11969 11974 11977 11988 12022 12027 12035 12036 12038 12053 12064 12066 12124 12163 12190 12195 12220 12238 12241 12250 12289 12308 12314 12323 12358 12362 12369 12372 12374 12399 12409 12422 12423 12425 12430 12462 12511 12562 12569 12577 12604 12612 12617 12677 12699 12708 12752 12774 12789 12802 12840 12857 12860 12868 12873 12876 12884 12893 12979 12994 13072 13074 13085 13095 13099 13124 13136 13139 13159 13184 13196 13206 13241 13290 13315 13323 13327 13331 
13342 13380 13406 13407 13415 13417 13418 13433 13440 13468 13473 13474 13491 13496 13521 13531 13578 13597 13660 13662 13664 13680 13769 13772 13780 13797 13826 13834 13838 13839 13854 13871 13873 13892 13898 13899 13908 13936 13952 13958 13985 13988 13990 14019 14034 14048 14057 14074 14132 14135 14179 14203 14209 14240 14286 14316 14363 14373 14378 14386 14425 14431 14449 14459 14465 14490 14491 14505 14523 14548 14556 14558 14562 14574 14605 14606 14629 14653 14657 14665 14666 14670 14675 14681 14706 14722 14724 14727 14729 14738 14758 14759 14762 14780 14792 14816 14866 14884 14932 14942 14961 14981 15004 15008 15050 15075 15141 15171 15181 15182 15193 15201 15208 15221 15238 15249 15259 15260 15286 15287 15318 15325 15335 15337 15343 15367 15409 15414 15419 15428 15471 15480 15503 15521 15525 15530 15551 15558 15574 15593 15600 15638 15639 15655 15679 15680 15684 15711 15713 15733 15748 15749 15769 15782 15788 15797 15813 15823 15824 15833 15837 15840 15847 15852 15857 15862 15865 15877 15880 15910 15913 15917 15923 15925 15953 15984 15989 15993 16002 16007 16019 16033 16036 16118 16122 16209 16240 16252 16278
## Number of NA values: 966
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in occupation variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in occupation variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in relationship variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in relationship variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in relationship variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in race variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in race variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in race variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in sex variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in sex variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in sex variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-gain variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-gain variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-gain variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-loss variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-loss variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-loss variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in hours-per-week variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in hours-per-week variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in hours-per-week variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in native-country variable>
## 20 66 84 189 254 306 330 404 421 472 516 649 666 688 844 1009 1039 1164 1334 1365 1406 1616 1644 1801 1822 1823 1832 1941 2061 2096 2107 2161 2227 2264 2305 2318 2324 2350 2477 2489 2552 2585 2613 2630 2697 2703 2775 2886 3061 3075 3122 3160 3222 3440 3460 3485 3508 3672 3678 3730 3762 3786 3854 3867 4187 4409 4540 4545 4608 4643 4649 4697 4728 4748 4764 4911 4923 5053 5126 5149 5152 5171 5181 5420 5469 5497 5648 5662 5717 5732 5829 5837 5944 5973 6034 6048 6054 6180 6206 6208 6234 6372 6403 6518 6587 6762 6776 6798 6801 6863 6871 6876 7017 7047 7060 7167 7206 7232 7288 7355 7443 7598 7601 7677 7708 7721 7750 7817 8029 8044 8078 8161 8183 8265 8369 8378 8433 8600 8622 8634 8700 8774 8849 8938 8976 9057 9145 9180 9200 9240 9244 9254 9263 9297 9335 9340 9354 9358 9415 9436 9497 9552 9567 9581 9626 9635 9699 9740 9874 9957 9983 10048 10151 10157 10202 10208 10267 10334 10346 10356 10364 10409 10475 10476 10509 10711 10739 10842 11130 11314 11348 11390 11407 11610 11686 11733 11749 11762 11784 11889 11946 12371 12386 12398 12415 12436 12456 12506 12577 12579 12607 12626 12648 12725 12780 12797 12911 12990 13171 13241 13254 13293 13311 13362 13547 13550 13575 13614 13693 13721 13746 13760 13764 13792 13926 13931 13934 13971 13980 14005 14029 14030 14072 14189 14203 14225 14263 14334 14373 14407 14446 14547 14585 14611 14652 14732 15006 15015 15084 15091 15099 15185 15234 15321 15350 15397 15421 15481 15594 15685 15712 16044 16091 16266
## Number of NA values: 274
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in native-country variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in native-country variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in income variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in income variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in income variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
# Percentage of missing (NA) values in each column of the training set.
# is.na() on a data frame yields a logical matrix, so colMeans() gives the NA
# share per column without apply() first coercing every column to character.
colMeans(is.na(train)) * 100
## age workclass fnlwgt education education-num
## 0.000000 5.638647 0.000000 0.000000 0.000000
## marital-status occupation relationship race sex
## 0.000000 5.660146 0.000000 0.000000 0.000000
## capital-gain capital-loss hours-per-week native-country income
## 0.000000 0.000000 0.000000 1.790486 0.000000
# Percentage of missing (NA) values in each column of the testing set.
# is.na() on a data frame yields a logical matrix, so colMeans() gives the NA
# share per column without apply() first coercing every column to character.
colMeans(is.na(test)) * 100
## age workclass fnlwgt education education-num
## 0.000000 5.914870 0.000000 0.000000 0.000000
## marital-status occupation relationship race sex
## 0.000000 5.933296 0.000000 0.000000 0.000000
## capital-gain capital-loss hours-per-week native-country income
## 0.000000 0.000000 0.000000 1.682943 0.000000
# Tabulate the missing-data patterns of the training set with md.pattern()
# (attributed to the MICE package by the original note). In the printed table
# 1 marks an observed value and 0 a missing one.
md.pattern(train)
## age fnlwgt education education-num marital-status relationship race
## 30162 1 1 1 1 1 1 1
## 7 1 1 1 1 1 1 1
## 556 1 1 1 1 1 1 1
## 1809 1 1 1 1 1 1 1
## 27 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## sex capital-gain capital-loss hours-per-week income native-country
## 30162 1 1 1 1 1 1
## 7 1 1 1 1 1 1
## 556 1 1 1 1 1 0
## 1809 1 1 1 1 1 1
## 27 1 1 1 1 1 0
## 0 0 0 0 0 583
## workclass occupation
## 30162 1 1 0
## 7 1 0 1
## 556 1 1 1
## 1809 0 0 2
## 27 0 0 3
## 1836 1843 4262
# Visualise the missingness of the training set with aggr() and keep the
# returned object in `plot`. With sortVars = TRUE the variables are also listed
# by their share of missing values.
# NOTE(review): aggr() is presumably VIM::aggr; the library() call is not
# visible in this part of the file.
plot <- aggr(
  train,
  col = c("blue", "yellow"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(train),
  cex.axis = 0.7,
  gap = 2,
  ylab = c("Missing data", "Pattern")
)
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## occupation 0.05660146
## workclass 0.05638647
## native-country 0.01790486
## age 0.00000000
## fnlwgt 0.00000000
## education 0.00000000
## education-num 0.00000000
## marital-status 0.00000000
## relationship 0.00000000
## race 0.00000000
## sex 0.00000000
## capital-gain 0.00000000
## capital-loss 0.00000000
## hours-per-week 0.00000000
## income 0.00000000
# Tabulate the missing-data patterns of the testing set; in the printed table
# 1 marks an observed value and 0 a missing one.
md.pattern(test)
## age fnlwgt education education-num marital-status relationship race
## 15060 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1
## 255 1 1 1 1 1 1 1
## 944 1 1 1 1 1 1 1
## 19 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## sex capital-gain capital-loss hours-per-week income native-country
## 15060 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 255 1 1 1 1 1 0
## 944 1 1 1 1 1 1
## 19 1 1 1 1 1 0
## 0 0 0 0 0 274
## workclass occupation
## 15060 1 1 0
## 3 1 0 1
## 255 1 1 1
## 944 0 0 2
## 19 0 0 3
## 963 966 2203
# Visualise the missingness of the testing set with aggr(); the returned object
# overwrites `plot`. With sortVars = TRUE the variables are also listed by
# their share of missing values.
# NOTE(review): aggr() is presumably VIM::aggr; the library() call is not
# visible in this part of the file.
plot <- aggr(
  test,
  col = c("blue", "yellow"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(test),
  cex.axis = 0.7,
  gap = 2,
  ylab = c("Missing data", "Pattern")
)
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## occupation 0.05933296
## workclass 0.05914870
## native-country 0.01682943
## age 0.00000000
## fnlwgt 0.00000000
## education 0.00000000
## education-num 0.00000000
## marital-status 0.00000000
## relationship 0.00000000
## race 0.00000000
## sex 0.00000000
## capital-gain 0.00000000
## capital-loss 0.00000000
## hours-per-week 0.00000000
## income 0.00000000
# Hmisc package to impute missing values
# ww <- aregImpute(~ age + workclass + fnlwgt + education + `education-num` + `marital-status` +
# occupation + relationship + race + sex + `capital-gain` + `capital-loss` +
# `hours-per-week` + income,
# data = train, n.impute = 5, group = "income")
#mlr package to impute missing values
# newworkclass <- impute(train[,2], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")
#
# newoccupation <- impute(train[,7], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")
#
# newcountry <- impute(train[,14], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")
#missForest package to impute missing values
# foresting <- missForest(train, maxiter = 5, ntree = 100)
# foresting$OOBerror
# newtrain <- foresting$ximp
# write.csv(newtrain, file = "../data/cleandata/newtrain.csv", col.names = T, row.names = F)
# Load the previously saved, imputed training set instead of re-running the
# (commented-out) missForest imputation.
# TRUE is spelled out because T is an ordinary, reassignable variable.
# stringsAsFactors = TRUE keeps the text columns as factors, as in the recorded
# str() output, on R >= 4.0 where the default changed to FALSE.
newtrain <- read.csv(
  "../data/cleandata/newtrain.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
dim(newtrain)
## [1] 32561 15
# foresting2 <- missForest(test, maxiter = 5, ntree = 100)
# foresting2$OOBerror
# newtest <- foresting2$ximp
# write.csv(newtest, file = "../data/cleandata/newtest.csv", col.names = T, row.names = F)
# Load the previously saved, imputed testing set instead of re-running the
# (commented-out) missForest imputation.
# TRUE is spelled out because T is an ordinary, reassignable variable.
# stringsAsFactors = TRUE keeps the text columns as factors, as in the recorded
# str() output, on R >= 4.0 where the default changed to FALSE.
newtest <- read.csv(
  "../data/cleandata/newtest.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
dim(newtest)
## [1] 16281 15
# Check that imputation changed nothing except the missing cells of the
# training set. 20 rows are drawn at random (independently, so a row may
# repeat) and each imputed row is compared cell by cell with the raw row:
#   1  - the imputed value equals the raw value
#   NA - the raw value was missing (comparing with NA yields NA)
#   0  - an observed value was altered; this should never appear
# The comparison rows are built first and bound once, rather than growing the
# matrix with rbind() from a dummy first row that is dropped afterwards.
check_rows <- sample.int(nrow(newtrain), 20, replace = TRUE)
t <- do.call(
  rbind,
  lapply(check_rows, function(a) newtrain[a, ] == train[a, ])
)
# Store the logical comparison as 0/1/NA numbers, as the loop version did.
storage.mode(t) <- "double"
t
## age workclass fnlwgt education education.num marital.status
## 3051 1 1 1 1 1 1
## 31315 1 1 1 1 1 1
## 6084 1 1 1 1 1 1
## 29934 1 1 1 1 1 1
## 27188 1 1 1 1 1 1
## 25542 1 1 1 1 1 1
## 18652 1 1 1 1 1 1
## 3224 1 1 1 1 1 1
## 32399 1 1 1 1 1 1
## 9143 1 1 1 1 1 1
## 29546 1 1 1 1 1 1
## 26228 1 1 1 1 1 1
## 6168 1 1 1 1 1 1
## 30951 1 1 1 1 1 1
## 17038 1 1 1 1 1 1
## 26789 1 1 1 1 1 1
## 17202 1 1 1 1 1 1
## 29103 1 1 1 1 1 1
## 20814 1 1 1 1 1 1
## 7638 1 1 1 1 1 1
## occupation relationship race sex capital.gain capital.loss
## 3051 1 1 1 1 1 1
## 31315 1 1 1 1 1 1
## 6084 1 1 1 1 1 1
## 29934 1 1 1 1 1 1
## 27188 1 1 1 1 1 1
## 25542 1 1 1 1 1 1
## 18652 1 1 1 1 1 1
## 3224 1 1 1 1 1 1
## 32399 1 1 1 1 1 1
## 9143 1 1 1 1 1 1
## 29546 1 1 1 1 1 1
## 26228 1 1 1 1 1 1
## 6168 1 1 1 1 1 1
## 30951 1 1 1 1 1 1
## 17038 1 1 1 1 1 1
## 26789 1 1 1 1 1 1
## 17202 1 1 1 1 1 1
## 29103 1 1 1 1 1 1
## 20814 1 1 1 1 1 1
## 7638 1 1 1 1 1 1
## hours.per.week native.country income
## 3051 1 1 1
## 31315 1 1 1
## 6084 1 1 1
## 29934 1 1 1
## 27188 1 1 1
## 25542 1 1 1
## 18652 1 1 1
## 3224 1 1 1
## 32399 1 1 1
## 9143 1 1 1
## 29546 1 1 1
## 26228 1 1 1
## 6168 1 1 1
## 30951 1 1 1
## 17038 1 1 1
## 26789 1 1 1
## 17202 1 NA 1
## 29103 1 1 1
## 20814 1 1 1
## 7638 1 1 1
# Check that imputation changed nothing except the missing cells of the
# testing set. 20 rows are drawn at random (independently, so a row may
# repeat) and each imputed row is compared cell by cell with the raw row:
#   1  - the imputed value equals the raw value
#   NA - the raw value was missing (comparing with NA yields NA)
#   0  - an observed value was altered; this should never appear
# The comparison rows are built first and bound once, rather than growing the
# matrix with rbind() from a dummy first row that is dropped afterwards.
check_rows <- sample.int(nrow(newtest), 20, replace = TRUE)
t2 <- do.call(
  rbind,
  lapply(check_rows, function(a) newtest[a, ] == test[a, ])
)
# Store the logical comparison as 0/1/NA numbers, as the loop version did.
storage.mode(t2) <- "double"
t2
## age workclass fnlwgt education education.num marital.status
## 2501 1 1 1 1 1 1
## 1628 1 1 1 1 1 1
## 7916 1 1 1 1 1 1
## 2844 1 1 1 1 1 1
## 3819 1 1 1 1 1 1
## 5284 1 1 1 1 1 1
## 3915 1 1 1 1 1 1
## 12315 1 1 1 1 1 1
## 2626 1 1 1 1 1 1
## 6014 1 1 1 1 1 1
## 5583 1 1 1 1 1 1
## 11575 1 1 1 1 1 1
## 14876 1 1 1 1 1 1
## 12425 1 NA 1 1 1 1
## 8425 1 1 1 1 1 1
## 5838 1 1 1 1 1 1
## 511 1 1 1 1 1 1
## 1575 1 1 1 1 1 1
## 9847 1 1 1 1 1 1
## 2292 1 1 1 1 1 1
## occupation relationship race sex capital.gain capital.loss
## 2501 1 1 1 1 1 1
## 1628 1 1 1 1 1 1
## 7916 1 1 1 1 1 1
## 2844 1 1 1 1 1 1
## 3819 1 1 1 1 1 1
## 5284 1 1 1 1 1 1
## 3915 1 1 1 1 1 1
## 12315 1 1 1 1 1 1
## 2626 1 1 1 1 1 1
## 6014 1 1 1 1 1 1
## 5583 1 1 1 1 1 1
## 11575 1 1 1 1 1 1
## 14876 1 1 1 1 1 1
## 12425 NA 1 1 1 1 1
## 8425 1 1 1 1 1 1
## 5838 1 1 1 1 1 1
## 511 1 1 1 1 1 1
## 1575 1 1 1 1 1 1
## 9847 1 1 1 1 1 1
## 2292 1 1 1 1 1 1
## hours.per.week native.country income
## 2501 1 1 1
## 1628 1 1 1
## 7916 1 1 1
## 2844 1 1 1
## 3819 1 1 1
## 5284 1 1 1
## 3915 1 1 1
## 12315 1 1 1
## 2626 1 1 1
## 6014 1 1 1
## 5583 1 1 1
## 11575 1 1 1
## 14876 1 1 1
## 12425 1 1 1
## 8425 1 1 1
## 5838 1 1 1
## 511 1 1 1
## 1575 1 1 1
## 9847 1 1 1
## 2292 1 1 1
\(\\\)
\(\\\)
# See structure and summaries before removing outliers.
# str() lists each column of the testing set loaded from newtest.csv with its
# type and first values.
str(newtest)
## 'data.frame': 16281 obs. of 15 variables:
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : Factor w/ 8 levels "Federal-gov",..: 4 4 2 4 4 4 4 6 4 4 ...
## $ fnlwgt : int 226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
## $ education.num : int 7 9 12 10 10 6 9 15 10 4 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 7 5 11 7 12 8 6 10 8 3 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ capital.gain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 50 40 40 30 30 40 32 40 10 ...
## $ native.country: Factor w/ 40 levels "Cambodia","Canada",..: 38 38 38 38 38 38 38 38 38 38 ...
## $ income : Factor w/ 2 levels "<=50K.",">50K.": 1 1 2 2 1 1 1 2 1 1 ...
# Per-column summary of newtest: quantiles and mean for integer columns, level
# counts for factor columns.
summary(newtest)
## age workclass fnlwgt
## Min. :17.00 Private :11963 Min. : 13492
## 1st Qu.:28.00 Self-emp-not-inc: 1433 1st Qu.: 116736
## Median :37.00 Local-gov : 1090 Median : 177831
## Mean :38.77 State-gov : 710 Mean : 189436
## 3rd Qu.:48.00 Self-emp-inc : 594 3rd Qu.: 238384
## Max. :90.00 Federal-gov : 481 Max. :1490400
## (Other) : 10
## education education.num marital.status
## HS-grad :5283 Min. : 1.00 Divorced :2190
## Some-college:3587 1st Qu.: 9.00 Married-AF-spouse : 14
## Bachelors :2670 Median :10.00 Married-civ-spouse :7403
## Masters : 934 Mean :10.07 Married-spouse-absent: 210
## Assoc-voc : 679 3rd Qu.:12.00 Never-married :5434
## 11th : 637 Max. :16.00 Separated : 505
## (Other) :2491 Widowed : 525
## occupation relationship race
## Prof-specialty :2111 Husband :6523 Amer-Indian-Eskimo: 159
## Craft-repair :2040 Not-in-family :4278 Asian-Pac-Islander: 480
## Exec-managerial:2035 Other-relative: 525 Black : 1561
## Adm-clerical :1967 Own-child :2513 Other : 135
## Sales :1921 Unmarried :1679 White :13946
## Other-service :1825 Wife : 763
## (Other) :4382
## sex capital.gain capital.loss hours.per.week
## Female: 5421 Min. : 0 Min. : 0.0 Min. : 1.00
## Male :10860 1st Qu.: 0 1st Qu.: 0.0 1st Qu.:40.00
## Median : 0 Median : 0.0 Median :40.00
## Mean : 1082 Mean : 87.9 Mean :40.39
## 3rd Qu.: 0 3rd Qu.: 0.0 3rd Qu.:45.00
## Max. :99999 Max. :3770.0 Max. :99.00
##
## native.country income
## United-States:14892 <=50K.:12435
## Mexico : 311 >50K. : 3846
## Philippines : 111
## Puerto-Rico : 70
## Germany : 69
## Canada : 61
## (Other) : 767
# str() lists each column of the training set loaded from newtrain.csv with its
# type and first values.
str(newtrain)
## 'data.frame': 32561 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
## $ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
# Per-column summary of newtrain: quantiles and mean for integer columns, level
# counts for factor columns.
summary(newtrain)
## age workclass fnlwgt
## Min. :17.00 Private :24068 Min. : 12285
## 1st Qu.:28.00 Self-emp-not-inc: 2776 1st Qu.: 117827
## Median :37.00 Local-gov : 2193 Median : 178356
## Mean :38.58 State-gov : 1352 Mean : 189778
## 3rd Qu.:48.00 Self-emp-inc : 1164 3rd Qu.: 237051
## Max. :90.00 Federal-gov : 985 Max. :1484705
## (Other) : 23
## education education.num marital.status
## HS-grad :10501 Min. : 1.00 Divorced : 4443
## Some-college: 7291 1st Qu.: 9.00 Married-AF-spouse : 23
## Bachelors : 5355 Median :10.00 Married-civ-spouse :14976
## Masters : 1723 Mean :10.08 Married-spouse-absent: 418
## Assoc-voc : 1382 3rd Qu.:12.00 Never-married :10683
## 11th : 1175 Max. :16.00 Separated : 1025
## (Other) : 5134 Widowed : 993
## occupation relationship race
## Prof-specialty :4295 Husband :13193 Amer-Indian-Eskimo: 311
## Craft-repair :4162 Not-in-family : 8305 Asian-Pac-Islander: 1039
## Exec-managerial:4129 Other-relative: 981 Black : 3124
## Adm-clerical :3992 Own-child : 5068 Other : 271
## Sales :3715 Unmarried : 3446 White :27816
## Other-service :3696 Wife : 1568
## (Other) :8572
## sex capital.gain capital.loss hours.per.week
## Female:10771 Min. : 0 Min. : 0.0 Min. : 1.00
## Male :21790 1st Qu.: 0 1st Qu.: 0.0 1st Qu.:40.00
## Median : 0 Median : 0.0 Median :40.00
## Mean : 1078 Mean : 87.3 Mean :40.44
## 3rd Qu.: 0 3rd Qu.: 0.0 3rd Qu.:45.00
## Max. :99999 Max. :4356.0 Max. :99.00
##
## native.country income
## United-States:29675 <=50K:24720
## Mexico : 657 >50K : 7841
## Philippines : 211
## Germany : 137
## Canada : 121
## Puerto-Rico : 114
## (Other) : 1646
# Outlier exploration for the training set.
# Positions of the continuous columns: age, fnlwgt, education.num,
# capital.gain, capital.loss and hours.per.week.
continuouscol <- c(1, 3, 5, 11, 12, 13)
par(mfrow = c(2, 3))  # 2 x 3 panel grid for the plots below

# Box plots, one per continuous column.
for (col_idx in continuouscol) {
  col_name <- names(newtrain)[col_idx]
  boxplot(newtrain[[col_idx]], xlab = col_name,
          main = paste("boxplot for", col_name))
}

# Density curves, filled red with a blue outline.
for (col_idx in continuouscol) {
  col_density <- density(newtrain[[col_idx]], adjust = 1)
  plot(col_density,
       main = paste("density plot for", names(newtrain)[col_idx]))
  polygon(col_density, col = "red", border = "blue")
}

# Values flagged by boxplot.stats() and their count, stored at the column's
# own position in the list.
# NOTE(review): positions 2, 4 and 6-10 therefore stay NULL, and head() only
# prints the first six slots, so columns 11-13 are not shown below.
outlierstrain <- list()
for (col_idx in continuouscol) {
  flagged <- boxplot.stats(newtrain[[col_idx]])$out
  outlierstrain[[col_idx]] <- list(flagged, length(flagged))
}
head(outlierstrain)
## [[1]]
## [[1]][[1]]
## [1] 79 90 80 81 90 88 90 90 80 90 81 82 79 81 80 83 90 90 79 81 90 90 80
## [24] 90 90 79 79 84 90 80 90 81 83 84 81 79 85 82 79 80 90 90 90 84 80 90
## [47] 90 79 84 90 79 90 90 90 82 81 90 84 79 81 82 81 80 90 80 84 82 79 90
## [70] 84 90 83 79 81 80 79 80 79 80 90 90 80 90 90 81 83 82 90 90 81 80 80
## [93] 90 79 80 82 85 80 79 90 81 79 80 79 81 82 88 90 82 88 84 83 79 86 90
## [116] 90 82 83 81 79 90 80 81 79 84 84 79 90 80 81 81 81 90 87 90 80 80 82
## [139] 90 90 85 82 81
##
## [[1]][[2]]
## [1] 143
##
##
## [[2]]
## NULL
##
## [[3]]
## [[3]][[1]]
## [1] 544091 507875 446839 432376 494223 428030 483777 633742
## [9] 523910 635913 538583 477983 425161 860348 423158 481060
## [17] 416103 445382 1033222 426017 543162 433665 462440 556660
## [25] 430828 475028 420537 680390 499233 543028 465507 526968
## [33] 767403 431192 520586 445824 416745 444304 441454 421132
## [41] 795830 419721 509350 467108 444554 449257 441620 563883
## [49] 431745 436006 473040 910398 451940 428350 421871 443040
## [57] 420895 496743 429507 418324 538319 508336 445382 483201
## [65] 452205 672412 473547 421065 505119 460046 549430 441591
## [73] 438696 488720 482082 460835 519627 675421 481987 758700
## [81] 509364 432565 490332 466224 446219 423460 509364 656036
## [89] 443508 566117 436253 454508 427686 548510 545483 503012
## [97] 573583 511361 454941 452405 716416 480861 498785 637222
## [105] 430084 423770 417657 446358 457402 664821 462890 598606
## [113] 457237 465326 503923 572751 580248 519006 617021 437994
## [121] 596776 588905 517995 640383 504725 423863 420917 470663
## [129] 611029 437851 495888 549341 421837 746786 550848 510072
## [137] 449432 430471 416129 511331 446559 452640 456399 469705
## [145] 656036 488720 434710 449354 425627 417136 460835 416338
## [153] 424079 423561 688355 587310 628797 421449 424988 443508
## [161] 632613 499249 445758 416164 473133 450580 506329 445168
## [169] 516337 432376 571853 1184622 913447 476573 632593 595000
## [177] 703067 484475 476391 749105 459465 543922 420282 498325
## [185] 447579 420749 482732 437281 427965 505980 549349 496025
## [193] 562558 642830 435022 443546 523095 436770 436493 704108
## [201] 557082 477106 471452 426001 464536 451996 505980 454614
## [209] 473748 506858 434102 454989 537222 595000 454508 577521
## [217] 424012 431426 604506 564135 427781 469907 503675 444089
## [225] 435835 512103 716066 487486 484298 479765 444743 483596
## [233] 525878 423250 538443 493034 434292 496382 432154 528616
## [241] 515025 433491 421223 428350 446358 455995 659273 435604
## [249] 425092 452924 541737 444822 423024 445940 468706 428584
## [257] 972354 459189 498216 608184 444219 433788 586657 1226583
## [265] 664670 447346 504725 427055 561334 499001 791084 917220
## [273] 430084 508548 511289 416577 512992 431745 427862 637080
## [281] 431861 671292 442612 494638 431307 459007 517000 421446
## [289] 548361 648223 522881 433669 461678 416059 473836 745768
## [297] 523067 508891 486332 418176 417419 464945 454508 476653
## [305] 488706 647882 569761 585203 539563 1038553 567788 732569
## [313] 416165 721161 509629 474136 450924 477697 423711 419658
## [321] 553473 496414 421967 453067 466458 421561 483530 560804
## [329] 447079 528616 485496 425528 502316 467799 469921 444134
## [337] 443179 497300 426431 607848 501172 441700 483822 420973
## [345] 514033 470663 472604 487411 558183 416829 430005 426263
## [353] 439608 456236 420779 541282 518030 459248 548580 526528
## [361] 447739 586657 433375 581071 437727 575442 554986 592930
## [369] 632834 423052 504951 484861 449576 496538 459463 505438
## [377] 479482 467108 467108 849857 426562 558944 420054 691903
## [385] 419691 684015 423605 461678 466498 530099 554317 420054
## [393] 450920 427952 695136 698418 464103 526968 450695 548303
## [401] 529216 526164 506436 439919 734193 737315 544686 468713
## [409] 548361 556652 691830 520775 442429 433669 607799 660870
## [417] 440456 471990 483822 423222 500509 487742 498785 423064
## [425] 532379 426895 493862 424855 469602 432555 424468 428271
## [433] 464502 446140 480717 529104 456110 451744 680390 438711
## [441] 483450 419053 857532 454063 1484705 424034 421837 425447
## [449] 456956 434467 755858 523484 436861 654141 469864 424034
## [457] 458549 930948 664366 420629 456236 515629 606111 463667
## [465] 431637 509364 634226 458558 483261 420749 446358 428405
## [473] 451996 423297 568490 447882 450246 456236 448626 1268339
## [481] 467579 455995 698363 617860 615893 427382 565313 591711
## [489] 520231 461337 419554 460408 454915 448337 536725 472070
## [497] 430175 446771 485117 500002 462294 443508 418020 435638
## [505] 420277 511517 438139 462255 1366120 495061 420351 431245
## [513] 434894 441210 419394 593246 449432 473133 440138 462838
## [521] 423222 529223 456618 651396 451951 431861 517036 436361
## [529] 497788 529216 441637 526734 543042 428299 427744 501144
## [537] 417668 631947 489085 436798 443855 438427 437890 540712
## [545] 549174 460437 806552 604537 487085 436341 473748 484024
## [553] 1455435 445382 659504 416745 439263 556688 750972 424884
## [561] 607848 454915 419895 548256 493363 463194 450695 422149
## [569] 552354 469056 435503 561489 455361 578377 509500 889965
## [577] 462180 506329 428499 507086 419732 659558 440129 609935
## [585] 521400 608184 425804 415913 513660 424478 422960 445728
## [593] 467108 615367 557236 562336 427474 493443 443546 430554
## [601] 434097 520078 460408 454934 474617 485117 456618 660461
## [609] 423222 442035 533147 497253 617898 449354 419722 440607
## [617] 442045 450544 953588 425622 609789 598995 421633 609789
## [625] 424719 482732 469697 452283 663394 417668 530454 494784
## [633] 436107 543477 452452 481096 420054 495982 556902 421412
## [641] 432052 418405 732102 548256 476334 709445 463072 469454
## [649] 423616 456604 609789 570821 438176 416356 421561 636017
## [657] 703107 544792 434463 434114 423222 418961 595088 438996
## [665] 607848 433705 462832 476334 527162 470875 416415 456572
## [673] 422836 566049 602513 509060 448026 491000 488541 520033
## [681] 554206 429346 455379 443742 520759 421837 694812 578701
## [689] 422013 462869 456618 549413 598802 511289 464103 462294
## [697] 427422 440417 439919 424494 806316 459548 541343 438839
## [705] 439592 1033222 424468 599629 571017 416577 425199 738812
## [713] 497280 447066 477209 431513 618191 544268 557853 535978
## [721] 668319 423024 491421 682947 469572 574271 456460 478829
## [729] 816750 597843 442274 595461 553405 506329 704108 481987
## [737] 460408 515712 551962 572751 745817 422933 473171 481175
## [745] 433170 476558 420986 447488 446512 497486 433330 496856
## [753] 1161363 435836 424591 425049 441542 419691 433330 444607
## [761] 459342 452808 427474 447555 422718 673764 424494 418405
## [769] 446654 434467 479621 472789 454843 456062 588484 809585
## [777] 493689 445382 482927 503454 574271 462820 478994 434268
## [785] 501671 594187 439779 509462 435469 548664 422813 498079
## [793] 431515 447488 466502 558490 456661 509048 419146 468713
## [801] 653574 706026 511068 427965 452640 475324 470203 513416
## [809] 421561 417941 535978 422249 442274 721712 615367 472580
## [817] 549174 437825 1097453 423222 461715 471452 426836 442131
## [825] 477867 461929 478380 479611 419146 472807 515797 475322
## [833] 510072 570562 491000 419134 423024 473133 1085515 500720
## [841] 421633 511668 455361 521665 478457 548361 591711 518530
## [849] 594187 417668 452406 499197 434430 509866 504871 695411
## [857] 420986 442359 462966 761006 484669 423616 467611 440647
## [865] 506830 574005 478205 604045 465974 415913 605502 589809
## [873] 426467 487347 588003 509629 431426 429897 709798 561334
## [881] 481987 570002 443546 1125613 454915 440706 532845 498328
## [889] 604380 583755 437909 420691 510072 557349 501172 609789
## [897] 476599 424094 557644 706180 425785 606752 417668 673764
## [905] 460214 475324 547886 554206 430035 456236 419740 462832
## [913] 440129 584790 425804 481987 799281 657397 496526 426431
## [921] 440969 487330 444554 512771 466325 440969 512828 422275
## [929] 531055 437666 472166 653574 417605 502837 444304 436798
## [937] 745768 478346 857532 715938 747719 569930 423217 433989
## [945] 475322 585361 452402 425497 502752 492263 543922 766115
## [953] 461337 421561 456922 584259 493034 538822 542265 430283
## [961] 498349 431245 491862 420895 448337 418702 477505 421467
## [969] 469454 749636 433906 437727 668362 449101 981628 470368
## [977] 746432 451059 499935 473625 566537 456367 455553 693066
## [985] 539864 447346 478315 427686 435842 485710 436163 514716
##
## [[3]][[2]]
## [1] 992
##
##
## [[4]]
## NULL
##
## [[5]]
## [[5]][[1]]
## [1] 4 3 4 4 2 4 3 4 2 1 4 4 3 3 3 4 2 2 2 3 3 2 4 4 4 3 4 4 3 3 4 3 2 1
## [35] 4 4 4 4 2 2 3 3 4 3 4 3 4 4 3 2 4 4 4 4 3 4 4 4 4 4 4 2 4 4 4 4 3 3
## [69] 4 3 4 4 4 4 4 4 4 4 3 4 3 4 4 2 2 3 3 4 3 2 4 4 4 3 3 2 2 4 3 4 1 4
## [103] 1 4 4 4 3 3 4 3 4 4 4 2 4 3 4 3 3 3 1 4 4 4 4 4 1 4 4 4 3 3 4 4 4 4
## [137] 4 3 4 4 3 2 4 4 4 1 3 4 4 4 4 2 2 4 4 4 2 4 4 3 4 4 4 4 2 4 4 4 3 4
## [171] 3 3 3 4 2 4 4 2 4 4 4 3 4 4 4 3 4 3 4 3 4 3 4 2 3 3 4 4 3 3 4 2 4 3
## [205] 2 2 4 4 2 2 4 4 2 2 3 3 3 4 3 4 4 4 4 4 1 4 3 4 4 4 4 3 4 4 4 1 4 4
## [239] 4 4 4 4 4 4 1 3 4 1 4 4 2 4 2 4 4 4 3 3 3 4 4 4 4 3 2 2 4 4 3 4 4 2
## [273] 4 1 4 4 4 4 4 4 4 4 3 1 1 1 4 4 4 2 4 3 3 3 4 2 4 4 4 3 2 4 4 4 2 4
## [307] 1 4 4 4 4 3 2 2 4 4 4 3 3 3 2 2 4 3 4 3 4 4 4 4 3 4 3 4 4 3 4 4 4 3
## [341] 4 4 3 3 4 3 4 2 3 2 4 3 2 3 4 4 4 2 4 4 4 4 3 3 4 4 2 4 3 1 3 2 4 3
## [375] 3 4 3 3 4 4 2 4 3 2 3 4 3 4 4 3 3 2 4 4 4 3 4 3 4 1 4 4 2 2 4 3 1 4
## [409] 3 3 4 3 4 4 4 3 3 3 4 3 1 4 2 2 4 3 3 3 2 4 4 4 3 4 4 2 3 4 4 3 3 4
## [443] 3 4 4 4 4 4 4 4 3 2 4 3 4 4 3 2 4 2 4 4 4 3 4 3 4 4 4 2 4 4 3 3 4 3
## [477] 1 3 2 3 2 4 4 4 3 4 2 2 4 2 2 3 4 2 3 4 3 3 4 4 4 3 2 3 3 3 4 4 4 4
## [511] 2 3 4 3 2 3 3 3 4 3 4 3 4 4 4 3 4 3 2 4 4 3 3 4 3 4 3 4 3 3 3 2 3 3
## [545] 4 4 1 4 3 4 3 2 4 2 4 3 3 4 3 3 4 2 4 4 4 2 4 4 4 4 4 4 4 4 4 3 2 4
## [579] 2 4 4 3 4 4 4 4 4 3 3 4 2 4 4 3 1 3 4 4 1 3 4 4 4 4 3 4 2 4 4 4 4 2
## [613] 4 3 4 4 4 4 3 4 4 3 2 3 4 2 4 4 4 3 4 3 4 4 4 4 3 4 3 3 4 2 2 3 4 4
## [647] 3 4 4 3 4 3 3 4 4 4 4 4 4 3 3 4 3 2 1 4 4 3 4 3 4 3 3 4 3 4 2 2 4 4
## [681] 2 4 3 2 4 3 4 2 4 3 2 4 3 4 2 2 3 2 3 4 4 4 4 4 4 4 4 3 4 4 3 4 2 4
## [715] 4 4 4 4 4 4 2 4 4 4 4 3 4 3 4 3 1 4 4 3 2 4 3 3 4 4 3 3 4 4 4 3 2 4
## [749] 4 2 3 4 4 4 4 4 3 4 4 3 4 1 4 1 4 4 4 2 4 3 4 4 2 4 1 3 3 3 4 1 3 4
## [783] 4 3 2 4 2 4 4 3 4 3 4 4 1 4 2 3 3 3 2 4 3 4 4 4 4 2 1 2 4 3 4 4 4 3
## [817] 4 3 3 1 4 3 3 2 4 3 3 2 4 3 4 3 4 4 4 4 3 4 4 4 4 4 4 3 2 4 2 3 3 3
## [851] 4 4 4 4 3 3 4 4 4 3 3 2 4 4 4 4 1 4 2 4 4 4 4 3 4 4 4 2 4 4 4 4 1 4
## [885] 1 4 4 4 4 4 2 4 1 4 1 4 4 4 4 3 4 1 4 4 4 4 3 4 3 3 3 4 3 3 2 3 4 4
## [919] 4 1 4 2 4 4 4 4 3 4 3 4 4 3 1 4 4 4 3 4 2 4 4 3 4 3 4 4 3 2 4 4 4 1
## [953] 4 4 1 4 4 4 4 4 3 2 3 4 3 3 2 3 3 4 4 4 2 4 4 2 4 3 1 4 4 2 4 1 4 4
## [987] 3 3 3 3 3 4 3 4 3 3 2 4 3 4 4 4 4 4 4 3 4 3 3 4 3 4 3 2 4 4 4 3 4 3
## [1021] 4 3 2 2 4 2 4 4 4 4 2 4 2 3 3 2 3 4 1 4 3 3 3 4 3 4 2 4 4 3 3 4 2 3
## [1055] 3 4 3 4 3 3 4 2 3 4 4 3 4 3 4 4 4 4 4 4 4 3 4 4 4 4 3 3 4 2 3 4 3 3
## [1089] 2 2 2 2 4 4 3 2 4 4 4 3 2 2 3 4 3 2 4 2 4 4 3 4 4 4 3 4 4 4 3 3 4 3
## [1123] 3 3 4 3 3 4 2 3 4 4 2 4 2 2 2 4 3 4 4 3 3 2 2 4 2 4 3 3 2 4 3 2 4 3
## [1157] 3 4 4 4 4 4 4 2 1 4 2 2 4 4 2 4 4 1 2 4 4 4 3 3 3 1 4 2 3 4 1 4 4 2
## [1191] 3 2 4 4 1 4 4 4
##
## [[5]][[2]]
## [1] 1198
##
##
## [[6]]
## NULL
# Row positions of the 15 largest fnlwgt values (column 3), ascending by value.
# order() alone yields the same permutation as order(rank()), ties included.
fnlwgttrainout <- tail(order(newtrain[, 3]), 15)
# The matching fnlwgt values, fetched in one vectorised lookup instead of
# growing a vector inside a 1:length() loop (which misbehaves on empty input).
fnlout <- newtrain[fnlwgttrainout, 3]
#head(order(rank(newtrain[,5])))
# Frequency of each distinct capital.gain value (column 11).
table(newtrain[, 11])
##
## 0 114 401 594 914 991 1055 1086 1111 1151 1173 1409
## 29849 6 2 34 8 5 25 4 1 8 3 7
## 1424 1455 1471 1506 1639 1797 1831 1848 2009 2036 2050 2062
## 3 1 7 15 1 7 7 6 3 4 5 2
## 2105 2174 2176 2202 2228 2290 2329 2346 2354 2387 2407 2414
## 9 48 23 16 5 5 6 6 11 1 19 8
## 2463 2538 2580 2597 2635 2653 2829 2885 2907 2936 2961 2964
## 11 1 12 20 11 5 31 24 11 3 3 9
## 2977 2993 3103 3137 3273 3325 3411 3418 3432 3456 3464 3471
## 8 2 97 37 6 53 24 5 4 2 23 8
## 3674 3781 3818 3887 3908 3942 4064 4101 4386 4416 4508 4650
## 14 12 7 6 32 14 42 20 70 12 12 41
## 4687 4787 4865 4931 4934 5013 5060 5178 5455 5556 5721 6097
## 3 23 17 1 7 69 1 97 11 5 3 1
## 6360 6418 6497 6514 6723 6767 6849 7298 7430 7443 7688 7896
## 3 9 11 5 2 5 27 246 9 5 284 3
## 7978 8614 9386 9562 10520 10566 10605 11678 13550 14084 14344 15020
## 1 55 22 4 43 6 12 2 27 41 26 5
## 15024 15831 18481 20051 22040 25124 25236 27828 34095 41310 99999
## 347 6 2 37 1 4 11 34 5 2 159
# Rows of the training set whose capital.gain (column 11) equals 99999, the
# maximum in the frequency table above (159 rows). Selecting by value instead
# of a hard-coded count keeps the step correct if the data changes.
gainout <- which(newtrain[, 11] == 99999)
# Remove the outliers from the training set.
dim(newtrain)
## [1] 32561 15
# Negative indexing with an empty index vector would drop every row, so only
# subset when there is something to remove.
if (length(gainout) > 0) {
  newtrain <- newtrain[-gainout, ]
}
dim(newtrain)
## [1] 32402 15
# Outlier exploration for the testing set, using the column positions held in
# continuouscol.

# Box plots, one per continuous column.
for (col_idx in continuouscol) {
  col_name <- names(newtest)[col_idx]
  boxplot(newtest[[col_idx]], xlab = col_name,
          main = paste("boxplot for", col_name))
}

# Density curves, filled red with a blue outline.
for (col_idx in continuouscol) {
  col_density <- density(newtest[[col_idx]], adjust = 1)
  plot(col_density,
       main = paste("density plot for", names(newtest)[col_idx]))
  polygon(col_density, col = "red", border = "blue")
}

# Values flagged by boxplot.stats() and their count, stored at the column's
# own position in the list (other positions stay NULL).
outlierstest <- list()
for (col_idx in continuouscol) {
  flagged <- boxplot.stats(newtest[[col_idx]])$out
  outlierstest[[col_idx]] <- list(flagged, length(flagged))
}
head(outlierstest)
## [[1]]
## [[1]][[1]]
## [1] 79 80 90 79 80 81 82 83 81 85 80 90 81 84 81 89 81 83 81 82 80 90 81
## [24] 83 80 90 90 84 80 80 80 81 90 85 90 81 81 80 80 79 81 80 88 87 90 79
## [47] 83 79 80 90 79 79 81 81 90 82 90 87 81 88 80 81 80 81 90 88 89 84 80
## [70] 80 83 79 81
##
## [[1]][[2]]
## [1] 73
##
##
## [[2]]
## NULL
##
## [[3]]
## [[3]][[1]]
## [1] 444554 432824 465326 445382 479296 428420 456736 537222
## [9] 513100 447488 512864 500068 446894 599057 479179 471990
## [17] 457162 455379 542610 479600 448026 437200 652784 573446
## [25] 453233 662460 426589 629900 499971 450770 481987 478373
## [33] 486194 509364 632733 504725 560313 651702 644278 535852
## [41] 445758 452353 475775 455469 522241 427744 473206 427541
## [49] 581128 444725 608881 490871 430151 431245 451019 430336
## [57] 433602 437994 436431 914061 624006 510072 484475 505365
## [65] 593246 714597 816750 491214 446724 552529 454717 425622
## [73] 575172 475322 622192 566066 493732 427437 427320 614113
## [81] 445365 472517 459556 548568 565769 429832 424988 426350
## [89] 789600 424340 447144 864960 497414 471876 723746 427422
## [97] 421837 692831 535869 433624 638116 467936 698039 427812
## [105] 472861 449101 677398 464621 547931 497039 451742 460322
## [113] 666014 474568 452640 765214 445480 761800 460356 1047822
## [121] 436651 544319 617917 450695 429696 443377 522881 437161
## [129] 421010 479296 459189 469005 457070 750972 505365 458609
## [137] 520231 589155 538193 428251 454321 455399 477345 470486
## [145] 437318 588739 449578 486436 588484 449101 528618 806552
## [153] 478354 467936 505168 858091 451327 482082 663291 447554
## [161] 451603 455995 460408 581025 453983 656488 421633 478457
## [169] 422836 557349 421350 498267 442478 421228 655066 426431
## [177] 494371 737315 541755 436198 594521 442656 491000 455995
## [185] 430672 496856 589838 479296 605504 490332 423453 445382
## [193] 558752 448862 429281 772919 884434 495288 488720 444554
## [201] 604045 437940 697806 632271 497788 464484 587310 467759
## [209] 472344 438587 427055 538243 441227 459465 454950 439777
## [217] 1490400 768659 764638 437458 517995 718736 433682 477083
## [225] 442478 547108 474229 498833 882849 453663 443508 498411
## [233] 504423 746660 488459 423883 457357 501671 786418 565313
## [241] 483201 466458 424934 450200 465334 482096 451603 465725
## [249] 502633 473133 477867 435356 478457 653215 437825 576645
## [257] 510643 538099 425502 432480 482211 539019 496743 455379
## [265] 421132 452402 531055 454076 434081 452402 434710 446947
## [273] 472411 594187 685955 442116 435835 430278 548361 606111
## [281] 459192 592029 426263 513977 647591 566066 553588 433325
## [289] 491607 624572 488706 535740 607118 482677 420973 426431
## [297] 580591 449172 438427 557853 446390 487751 469263 478972
## [305] 441949 430930 635913 485944 557805 626493 444134 433580
## [313] 493034 914061 456736 557349 443336 953588 473547 457710
## [321] 471768 558344 421871 430710 481258 590204 679853 421474
## [329] 443809 516701 443546 535762 438321 814850 427812 874728
## [337] 497525 434102 450141 441949 438429 506830 478277 594194
## [345] 445480 452963 498267 538583 602513 589809 421474 507492
## [353] 546118 446647 530099 453686 443377 1117718 427248 461725
## [361] 460259 849067 590941 572285 608441 720428 423311 436361
## [369] 463601 557359 454024 431515 590522 443546 433592 479406
## [377] 430195 421633 428299 484911 478836 513440 744929 534775
## [385] 511231 598995 456592 525848 442359 458168 457453 913447
## [393] 584259 694105 441227 448841 606347 437566 495366 1024535
## [401] 427474 811615 431551 461929 533660 445382 427475 1210504
## [409] 426263 425830 421837 427770 447210 455995 435836 425816
## [417] 490645 513977 553405 497414 742903 431745 553405 504941
## [425] 450141 456665 449376 487770 448026 443858 473449 440934
## [433] 456430 421200 426589 484879 438696 435638 535027 464552
## [441] 443701 438427 513719 439263 425444 454585 428251 618130
## [449] 542762 771836 473133 464552 435266 437161 462964 423605
## [457] 618808 573446 432204 461484 455379 504871 532969 455665
## [465] 425127 449925 427515 607658 422933 430340 440129
##
## [[3]][[2]]
## [1] 471
##
##
## [[4]]
## NULL
##
## [[5]]
## [[5]][[1]]
## [1] 4 4 3 4 4 4 4 4 4 3 2 3 4 4 2 4 4 3 3 2 4 3 3 4 3 3 4 4 4 1 1 4 3 2 4
## [36] 4 2 3 4 4 1 4 1 4 4 4 3 4 4 3 4 3 4 2 4 2 4 4 4 3 4 2 4 4 3 3 1 1 4 3
## [71] 4 2 3 4 3 3 3 4 4 4 4 4 3 3 3 2 2 4 4 4 4 3 3 4 3 3 3 3 1 2 3 3 3 1 4
## [106] 4 4 4 4 4 4 4 2 3 4 4 3 4 4 4 3 3 3 4 4 1 4 4 4 3 4 2 4 2 4 4 4 4 3 3
## [141] 4 4 1 4 3 4 4 4 3 4 4 4 3 3 3 4 2 2 4 2 4 4 4 4 4 4 4 4 4 2 4 4 3 4 1
## [176] 2 3 4 3 2 4 1 4 2 3 3 4 4 4 1 2 2 4 3 4 4 4 4 3 2 4 4 4 4 3 3 3 4 3 4
## [211] 2 4 4 4 3 4 3 2 4 4 3 4 2 2 4 1 2 3 4 2 4 4 4 4 4 2 4 4 4 3 4 3 4 3 4
## [246] 3 4 3 4 3 4 4 4 4 3 3 3 2 3 4 3 4 4 4 3 1 2 2 2 2 3 1 2 3 4 4 4 1 1 2
## [281] 4 4 4 4 2 4 3 4 3 1 3 3 1 3 4 4 4 4 4 4 3 3 3 3 3 3 4 4 4 4 3 4 4 3 2
## [316] 4 4 2 4 4 3 4 3 4 4 4 4 4 2 3 4 4 3 2 4 2 4 4 4 4 2 3 4 4 3 3 4 3 2 3
## [351] 4 2 3 4 4 3 4 4 2 4 4 3 2 4 4 4 2 4 4 4 3 4 3 3 4 2 4 2 3 3 3 4 3 4 3
## [386] 4 1 4 3 4 4 3 4 2 4 2 3 3 4 3 2 1 1 2 3 3 4 3 1 3 3 2 4 3 4 3 3 3 4 3
## [421] 4 4 2 3 3 3 3 1 3 3 2 4 3 4 1 2 3 4 4 4 4 4 4 3 3 2 3 4 4 3 4 2 4 4 4
## [456] 4 4 2 4 2 4 2 4 4 3 4 3 2 4 3 4 4 3 4 4 4 4 4 3 4 4 3 4 3 4 4 3 2 4 2
## [491] 2 4 2 4 3 4 4 3 4 3 4 3 4 1 1 4 3 2 4 4 4 4 3 3 4 4 2 4 4 4 3 4 3 1 4
## [526] 3 3 4 3 4 4 4 4 4 4 4 4 4 3 2 3 4 3 4 4 4 4 4 3 4 4 3 4 3 4 2 2 3 2 3
## [561] 3 3 4 4 4 1 3 3 3 4 4 1 3 4 2 3 3 3 2 3 3 4 4 4 3 4 4 1 4 4 4 4 4 4 4
## [596] 4
##
## [[5]][[2]]
## [1] 596
##
##
## [[6]]
## NULL
# Frequency of each distinct capital.gain value (column 11) in the test set.
table(newtest[[11]])
##
## 0 114 401 594 914 991 1055 1086 1151 1173 1264 1409
## 14958 2 3 18 2 1 12 4 5 2 2 3
## 1424 1455 1471 1506 1731 1797 1831 1848 2036 2062 2105 2174
## 1 3 2 9 1 3 2 3 1 1 6 26
## 2176 2202 2290 2329 2346 2354 2407 2414 2463 2538 2580 2597
## 8 12 5 1 2 10 6 2 4 4 8 11
## 2635 2653 2829 2885 2907 2936 2961 2964 2977 2993 3103 3137
## 3 6 11 6 7 1 1 5 3 1 55 14
## 3273 3325 3411 3418 3456 3464 3471 3674 3781 3818 3887 3908
## 1 28 10 3 4 10 3 8 4 4 2 10
## 3942 4064 4101 4386 4416 4508 4650 4687 4787 4865 4931 4934
## 4 12 9 38 12 11 22 1 12 8 3 3
## 5013 5060 5178 5455 5556 5721 6097 6418 6497 6514 6612 6723
## 48 1 49 7 1 4 1 7 4 5 1 3
## 6767 6849 7262 7298 7430 7443 7688 7896 7978 8614 9386 9562
## 1 15 1 118 6 2 126 1 1 27 9 1
## 10520 10566 10605 11678 13550 14084 14344 15020 15024 15831 20051 25124
## 21 2 7 2 15 8 8 5 166 2 12 2
## 25236 27828 34095 41310 99999
## 3 24 1 1 85
# Rows of the testing set whose capital.gain (column 11) equals 99999, the
# maximum in the frequency table above (85 rows). Selecting by value instead
# of a hard-coded count keeps the step correct if the data changes.
gainout <- which(newtest[, 11] == 99999)
# Remove the outliers from the testing set.
dim(newtest)
## [1] 16281 15
# Negative indexing with an empty index vector would drop every row, so only
# subset when there is something to remove.
if (length(gainout) > 0) {
  newtest <- newtest[-gainout, ]
}
dim(newtest)
## [1] 16196 15
# Re-draw the box and density plots now that the capital.gain outliers are
# gone: training set first, then testing set.

# Training set: box plots.
for (col_idx in continuouscol) {
  col_name <- names(newtrain)[col_idx]
  boxplot(newtrain[[col_idx]], xlab = col_name,
          main = paste("boxplot for", col_name, "-outliers removed"))
}
# Training set: density curves.
for (col_idx in continuouscol) {
  col_density <- density(newtrain[[col_idx]], adjust = 1)
  plot(col_density,
       main = paste("density plot for", names(newtrain)[col_idx],
                    "-outliers removed"))
  polygon(col_density, col = "red", border = "blue")
}

# Testing set: box plots.
for (col_idx in continuouscol) {
  col_name <- names(newtest)[col_idx]
  boxplot(newtest[[col_idx]], xlab = col_name,
          main = paste("boxplot for", col_name, "-outliers removed"))
}
# Testing set: density curves.
for (col_idx in continuouscol) {
  col_density <- density(newtest[[col_idx]], adjust = 1)
  plot(col_density,
       main = paste("density plot for", names(newtest)[col_idx],
                    "-outliers removed"))
  polygon(col_density, col = "red", border = "blue")
}
# \(\\\)
# \(\\\)
#detach("package:plyr", unload=TRUE) # needed when plyr and dplyr are both attached, because their functions conflict
# Check whether the categorical variables can be discretized.
# Plot and tabulate workclass in the training set.
plot(newtrain[["workclass"]])
table(newtrain[["workclass"]])
##
## Federal-gov Local-gov Never-worked Private
## 983 2187 9 23984
## Self-emp-inc Self-emp-not-inc State-gov Without-pay
## 1127 2747 1351 14
# Row count and share of each workclass level in the training set.
newtrain %>%
  group_by(workclass) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 8 x 3
## workclass n freq
## <fctr> <int> <dbl>
## 1 Federal-gov 983 0.0303376335
## 2 Local-gov 2187 0.0674958336
## 3 Never-worked 9 0.0002777606
## 4 Private 23984 0.7402012221
## 5 Self-emp-inc 1127 0.0347818036
## 6 Self-emp-not-inc 2747 0.0847787174
## 7 State-gov 1351 0.0416949571
## 8 Without-pay 14 0.0004320721
# Plot and tabulate workclass in the testing set.
plot(newtest[["workclass"]])
table(newtest[["workclass"]])
##
## Federal-gov Local-gov Never-worked Private
## 480 1089 3 11919
## Self-emp-inc Self-emp-not-inc State-gov Without-pay
## 570 1421 707 7
# Row count and share of each workclass level in the testing set.
newtest %>%
  group_by(workclass) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 8 x 3
## workclass n freq
## <fctr> <int> <dbl>
## 1 Federal-gov 480 0.0296369474
## 2 Local-gov 1089 0.0672388244
## 3 Never-worked 3 0.0001852309
## 4 Private 11919 0.7359224500
## 5 Self-emp-inc 570 0.0351938750
## 6 Self-emp-not-inc 1421 0.0877377130
## 7 State-gov 707 0.0436527538
## 8 Without-pay 7 0.0004322055
# Plot and tabulate education in the training set.
plot(newtrain[["education"]])
table(newtrain[["education"]])
##
## 10th 11th 12th 1st-4th 5th-6th
## 931 1175 433 168 333
## 7th-8th 9th Assoc-acdm Assoc-voc Bachelors
## 646 513 1066 1381 5314
## Doctorate HS-grad Masters Preschool Prof-school
## 401 10478 1705 51 530
## Some-college
## 7277
# Row count and share of each education level in the training set.
newtrain %>%
  group_by(education) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 16 x 3
## education n freq
## <fctr> <int> <dbl>
## 1 10th 931 0.028732794
## 2 11th 1175 0.036263194
## 3 12th 433 0.013363373
## 4 1st-4th 168 0.005184865
## 5 5th-6th 333 0.010277143
## 6 7th-8th 646 0.019937041
## 7 9th 513 0.015832356
## 8 Assoc-acdm 1066 0.032899204
## 9 Assoc-voc 1381 0.042620826
## 10 Bachelors 5314 0.164002222
## 11 Doctorate 401 0.012375779
## 12 HS-grad 10478 0.323375100
## 13 Masters 1705 0.052620209
## 14 Preschool 51 0.001573977
## 15 Prof-school 530 0.016357015
## 16 Some-college 7277 0.224584902
# Plot and tabulate education in the testing set.
plot(newtest[["education"]])
table(newtest[["education"]])
##
## 10th 11th 12th 1st-4th 5th-6th
## 456 637 224 79 175
## 7th-8th 9th Assoc-acdm Assoc-voc Bachelors
## 309 242 534 677 2648
## Doctorate HS-grad Masters Preschool Prof-school
## 170 5272 922 32 236
## Some-college
## 3583
# Row count and share of each education level in the testing set.
newtest %>%
  group_by(education) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 16 x 3
## education n freq
## <fctr> <int> <dbl>
## 1 10th 456 0.028155100
## 2 11th 637 0.039330699
## 3 12th 224 0.013830575
## 4 1st-4th 79 0.004877748
## 5 5th-6th 175 0.010805137
## 6 7th-8th 309 0.019078785
## 7 9th 242 0.014941961
## 8 Assoc-acdm 534 0.032971104
## 9 Assoc-voc 677 0.041800445
## 10 Bachelors 2648 0.163497160
## 11 Doctorate 170 0.010496419
## 12 HS-grad 5272 0.325512472
## 13 Masters 922 0.056927636
## 14 Preschool 32 0.001975796
## 15 Prof-school 236 0.014571499
## 16 Some-college 3583 0.221227464
# Plot and tabulate marital.status in the training set.
plot(newtrain[["marital.status"]])
table(newtrain[["marital.status"]])
##
## Divorced Married-AF-spouse Married-civ-spouse
## 4432 23 14844
## Married-spouse-absent Never-married Separated
## 417 10671 1023
## Widowed
## 992
# Row count and share of each marital.status level in the training set.
newtrain %>%
  group_by(marital.status) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 7 x 3
## marital.status n freq
## <fctr> <int> <dbl>
## 1 Divorced 4432 0.1367816801
## 2 Married-AF-spouse 23 0.0007098327
## 3 Married-civ-spouse 14844 0.4581198691
## 4 Married-spouse-absent 417 0.0128695760
## 5 Never-married 10671 0.3293315227
## 6 Separated 1023 0.0315721252
## 7 Widowed 992 0.0306153941
# Plot and tabulate marital.status in the testing set.
plot(newtest[["marital.status"]])
table(newtest[["marital.status"]])
##
## Divorced Married-AF-spouse Married-civ-spouse
## 2181 13 7340
## Married-spouse-absent Never-married Separated
## 210 5425 503
## Widowed
## 524
# Row count and share of each marital.status level in the testing set.
newtest %>%
  group_by(marital.status) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 7 x 3
## marital.status n freq
## <fctr> <int> <dbl>
## 1 Divorced 2181 0.1346628797
## 2 Married-AF-spouse 13 0.0008026673
## 3 Married-civ-spouse 7340 0.4531983206
## 4 Married-spouse-absent 210 0.0129661645
## 5 Never-married 5425 0.3349592492
## 6 Separated 503 0.0310570511
## 7 Widowed 524 0.0323536676
# Plot and tabulate occupation in the training set.
plot(newtrain[["occupation"]])
table(newtrain[["occupation"]])
##
## Adm-clerical Armed-Forces Craft-repair Exec-managerial
## 3986 9 4154 4085
## Farming-fishing Handlers-cleaners Machine-op-inspct Other-service
## 1185 1617 2184 3694
## Priv-house-serv Prof-specialty Protective-serv Sales
## 206 4228 734 3690
## Tech-support Transport-moving
## 992 1638
# Row count and share of each occupation level in the training set.
newtrain %>%
  group_by(occupation) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 14 x 3
## occupation n freq
## <fctr> <int> <dbl>
## 1 Adm-clerical 3986 0.1230170977
## 2 Armed-Forces 9 0.0002777606
## 3 Craft-repair 4154 0.1282019628
## 4 Exec-managerial 4085 0.1260724647
## 5 Farming-fishing 1185 0.0365718166
## 6 Handlers-cleaners 1617 0.0499043269
## 7 Machine-op-inspct 2184 0.0674032467
## 8 Other-service 3694 0.1140053083
## 9 Priv-house-serv 206 0.0063576322
## 10 Prof-specialty 4228 0.1304857725
## 11 Protective-serv 734 0.0226529227
## 12 Sales 3690 0.1138818591
## 13 Tech-support 992 0.0306153941
## 14 Transport-moving 1638 0.0505524350
# Plot and tabulate occupation in the testing set.
plot(newtest[["occupation"]])
table(newtest[["occupation"]])
##
## Adm-clerical Armed-Forces Craft-repair Exec-managerial
## 1965 6 2032 2009
## Farming-fishing Handlers-cleaners Machine-op-inspct Other-service
## 576 864 1085 1824
## Priv-house-serv Prof-specialty Protective-serv Sales
## 133 2077 367 1912
## Tech-support Transport-moving
## 548 798
# Row count and share of each occupation level in the testing set.
newtest %>%
  group_by(occupation) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 14 x 3
## occupation n freq
## <fctr> <int> <dbl>
## 1 Adm-clerical 1965 0.1213262534
## 2 Armed-Forces 6 0.0003704618
## 3 Craft-repair 2032 0.1254630773
## 4 Exec-managerial 2009 0.1240429736
## 5 Farming-fishing 576 0.0355643369
## 6 Handlers-cleaners 864 0.0533465053
## 7 Machine-op-inspct 1085 0.0669918498
## 8 Other-service 1824 0.1126204001
## 9 Priv-house-serv 133 0.0082119042
## 10 Prof-specialty 2077 0.1282415411
## 11 Protective-serv 367 0.0226599160
## 12 Sales 1912 0.1180538405
## 13 Tech-support 548 0.0338355149
## 14 Transport-moving 798 0.0492714250
# Plot and tabulate relationship in the training set.
plot(newtrain[["relationship"]])
table(newtrain[["relationship"]])
##
## Husband Not-in-family Other-relative Own-child Unmarried
## 13072 8284 981 5066 3442
## Wife
## 1557
# Row count and share of each relationship level in the training set.
newtrain %>%
  group_by(relationship) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 6 x 3
## relationship n freq
## <fctr> <int> <dbl>
## 1 Husband 13072 0.40343189
## 2 Not-in-family 8284 0.25566323
## 3 Other-relative 981 0.03027591
## 4 Own-child 5066 0.15634837
## 5 Unmarried 3442 0.10622801
## 6 Wife 1557 0.04805259
# Plot and tabulate relationship in the testing set.
plot(newtest[["relationship"]])
table(newtest[["relationship"]])
##
## Husband Not-in-family Other-relative Own-child Unmarried
## 6465 4262 525 2511 1676
## Wife
## 757
# Row count and share of each relationship level in the testing set.
newtest %>%
  group_by(relationship) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 6 x 3
## relationship n freq
## <fctr> <int> <dbl>
## 1 Husband 6465 0.39917264
## 2 Not-in-family 4262 0.26315140
## 3 Other-relative 525 0.03241541
## 4 Own-child 2511 0.15503828
## 5 Unmarried 1676 0.10348234
## 6 Wife 757 0.04673994
# Plot and tabulate race in the training set.
plot(newtrain[["race"]])
table(newtrain[["race"]])
##
## Amer-Indian-Eskimo Asian-Pac-Islander Black
## 311 1029 3117
## Other White
## 269 27676
# Row count and share of each race level in the training set.
newtrain %>%
  group_by(race) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 5 x 3
## race n freq
## <fctr> <int> <dbl>
## 1 Amer-Indian-Eskimo 311 0.009598173
## 2 Asian-Pac-Islander 1029 0.031757299
## 3 Black 3117 0.096197766
## 4 Other 269 0.008301957
## 5 White 27676 0.854144806
# Plot and tabulate race in the testing set.
plot(newtest[["race"]])
table(newtest[["race"]])
##
## Amer-Indian-Eskimo Asian-Pac-Islander Black
## 159 475 1558
## Other White
## 134 13870
# Row count and share of each race level in the testing set.
newtest %>%
  group_by(race) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 5 x 3
## race n freq
## <fctr> <int> <dbl>
## 1 Amer-Indian-Eskimo 159 0.009817239
## 2 Asian-Pac-Islander 475 0.029328229
## 3 Black 1558 0.096196592
## 4 Other 134 0.008273648
## 5 White 13870 0.856384292
# Plot and tabulate sex in the training set.
plot(newtrain[["sex"]])
table(newtrain[["sex"]])
##
## Female Male
## 10749 21653
# Row count and share of each sex level in the training set.
newtrain %>%
  group_by(sex) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 2 x 3
## sex n freq
## <fctr> <int> <dbl>
## 1 Female 10749 0.3317388
## 2 Male 21653 0.6682612
# Plot and tabulate sex in the testing set.
plot(newtest[["sex"]])
table(newtest[["sex"]])
##
## Female Male
## 5407 10789
# Row count and share of each sex level in the testing set.
newtest %>%
  group_by(sex) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 2 x 3
## sex n freq
## <fctr> <int> <dbl>
## 1 Female 5407 0.3338479
## 2 Male 10789 0.6661521
# Plot and tabulate native.country in the training set.
plot(newtrain[["native.country"]])
table(newtrain[["native.country"]])
##
## Cambodia Canada
## 20 120
## China Columbia
## 79 59
## Cuba Dominican-Republic
## 95 70
## Ecuador El-Salvador
## 28 106
## England France
## 90 29
## Germany Greece
## 137 29
## Guatemala Haiti
## 64 44
## Holand-Netherlands Honduras
## 1 13
## Hong Hungary
## 23 13
## India Iran
## 104 43
## Ireland Italy
## 24 74
## Jamaica Japan
## 81 66
## Laos Mexico
## 22 656
## Nicaragua Outlying-US(Guam-USVI-etc)
## 34 14
## Peru Philippines
## 31 210
## Poland Portugal
## 60 37
## Puerto-Rico Scotland
## 114 12
## South Taiwan
## 89 56
## Thailand Trinadad&Tobago
## 19 19
## United-States Vietnam
## 29528 73
## Yugoslavia
## 16
# Row count and share of each native.country level in the training set.
newtrain %>%
  group_by(native.country) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 41 x 3
## native.country n freq
## <fctr> <int> <dbl>
## 1 Cambodia 20 0.0006172458
## 2 Canada 120 0.0037034751
## 3 China 79 0.0024381211
## 4 Columbia 59 0.0018208753
## 5 Cuba 95 0.0029319178
## 6 Dominican-Republic 70 0.0021603605
## 7 Ecuador 28 0.0008641442
## 8 El-Salvador 106 0.0032714030
## 9 England 90 0.0027776063
## 10 France 29 0.0008950065
## # ... with 31 more rows
# Plot and tabulate native.country in the testing set.
plot(newtest[["native.country"]])
table(newtest[["native.country"]])
##
## Cambodia Canada
## 12 61
## China Columbia
## 50 26
## Cuba Dominican-Republic
## 43 34
## Ecuador El-Salvador
## 17 49
## England France
## 38 9
## Germany Greece
## 69 20
## Guatemala Haiti
## 24 31
## Honduras Hong
## 7 10
## Hungary India
## 6 56
## Iran Ireland
## 16 13
## Italy Jamaica
## 32 25
## Japan Laos
## 32 5
## Mexico Nicaragua
## 310 15
## Outlying-US(Guam-USVI-etc) Peru
## 9 15
## Philippines Poland
## 109 27
## Portugal Puerto-Rico
## 30 70
## Scotland South
## 9 37
## Taiwan Thailand
## 17 13
## Trinadad&Tobago United-States
## 8 14813
## Vietnam Yugoslavia
## 22 7
# Row count and share of each native.country level in the testing set.
newtest %>%
  group_by(native.country) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
## # A tibble: 40 x 3
## native.country n freq
## <fctr> <int> <dbl>
## 1 Cambodia 12 0.0007409237
## 2 Canada 61 0.0037663621
## 3 China 50 0.0030871820
## 4 Columbia 26 0.0016053347
## 5 Cuba 43 0.0026549765
## 6 Dominican-Republic 34 0.0020992838
## 7 Ecuador 17 0.0010496419
## 8 El-Salvador 49 0.0030254384
## 9 England 38 0.0023462583
## 10 France 9 0.0005556928
## # ... with 30 more rows
#Check collinearity issues
#Count and relative frequency of each education level (factor version)
newtrain %>%
  group_by(education) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 16 x 3
## education n freq
## <fctr> <int> <dbl>
## 1 10th 931 0.028732794
## 2 11th 1175 0.036263194
## 3 12th 433 0.013363373
## 4 1st-4th 168 0.005184865
## 5 5th-6th 333 0.010277143
## 6 7th-8th 646 0.019937041
## 7 9th 513 0.015832356
## 8 Assoc-acdm 1066 0.032899204
## 9 Assoc-voc 1381 0.042620826
## 10 Bachelors 5314 0.164002222
## 11 Doctorate 401 0.012375779
## 12 HS-grad 10478 0.323375100
## 13 Masters 1705 0.052620209
## 14 Preschool 51 0.001573977
## 15 Prof-school 530 0.016357015
## 16 Some-college 7277 0.224584902
#Count and relative frequency of each education.num value (integer version)
newtrain %>%
  group_by(education.num) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 16 x 3
## education.num n freq
## <int> <int> <dbl>
## 1 1 51 0.001573977
## 2 2 168 0.005184865
## 3 3 333 0.010277143
## 4 4 646 0.019937041
## 5 5 513 0.015832356
## 6 6 931 0.028732794
## 7 7 1175 0.036263194
## 8 8 433 0.013363373
## 9 9 10478 0.323375100
## 10 10 7277 0.224584902
## 11 11 1381 0.042620826
## 12 12 1066 0.032899204
## 13 13 5314 0.164002222
## 14 14 1705 0.052620209
## 15 15 530 0.016357015
## 16 16 401 0.012375779
#The per-level counts of education and education.num printed above are the
#same set of 16 counts, so the two columns carry the same information; drop
#the factor version. The column is selected by name rather than by position
#(it was column 4) so that re-running this step cannot remove another column.
newtrain <- newtrain[, names(newtrain) != "education"]
newtest <- newtest[, names(newtest) != "education"]
\(\\\)
\(\\\)
#Find correlations of the data - for collinearity issue checks
#Pairwise correlations between numeric columns of the test set
cor(newtest[, c("age", "fnlwgt", "education.num", "capital.gain",
                "hours.per.week")])
## age fnlwgt education.num capital.gain
## age 1.00000000 -0.0759176992 0.01555523 0.1080390077
## fnlwgt -0.07591770 1.0000000000 -0.02926279 -0.0007549241
## education.num 0.01555523 -0.0292627902 1.00000000 0.1417220957
## capital.gain 0.10803901 -0.0007549241 0.14172210 1.0000000000
## hours.per.week 0.07425722 -0.0026773627 0.12954445 0.0833160656
## hours.per.week
## age 0.074257217
## fnlwgt -0.002677363
## education.num 0.129544454
## capital.gain 0.083316066
## hours.per.week 1.000000000
#Pairwise correlations between numeric columns of the training set
cor(newtrain[, c("age", "fnlwgt", "education.num", "capital.gain",
                 "hours.per.week")])
## age fnlwgt education.num capital.gain
## age 1.00000000 -0.076917052 0.03330048 0.116518227
## fnlwgt -0.07691705 1.000000000 -0.04362125 -0.004506565
## education.num 0.03330048 -0.043621248 1.00000000 0.145735884
## capital.gain 0.11651823 -0.004506565 0.14573588 1.000000000
## hours.per.week 0.06774934 -0.019547738 0.14384089 0.082952143
## hours.per.week
## age 0.06774934
## fnlwgt -0.01954774
## education.num 0.14384089
## capital.gain 0.08295214
## hours.per.week 1.00000000
#Remove the fnlwgt variable. It is selected by name rather than by position
#(it was column 3) so that re-running this step cannot remove another column.
newtrain <- newtrain[, names(newtrain) != "fnlwgt"]
newtest <- newtest[, names(newtest) != "fnlwgt"]
#See structure and summaries after removing outliers
str(newtest)
## 'data.frame': 16196 obs. of 13 variables:
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : Factor w/ 8 levels "Federal-gov",..: 4 4 2 4 4 4 4 6 4 4 ...
## $ education.num : int 7 9 12 10 10 6 9 15 10 4 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 7 5 11 7 12 8 6 10 8 3 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ capital.gain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 50 40 40 30 30 40 32 40 10 ...
## $ native.country: Factor w/ 40 levels "Cambodia","Canada",..: 38 38 38 38 38 38 38 38 38 38 ...
## $ income : Factor w/ 2 levels "<=50K.",">50K.": 1 1 2 2 1 1 1 2 1 1 ...
summary(newtest)
## age workclass education.num
## Min. :17.00 Private :11919 Min. : 1.00
## 1st Qu.:28.00 Self-emp-not-inc: 1421 1st Qu.: 9.00
## Median :37.00 Local-gov : 1089 Median :10.00
## Mean :38.72 State-gov : 707 Mean :10.06
## 3rd Qu.:48.00 Self-emp-inc : 570 3rd Qu.:12.00
## Max. :90.00 Federal-gov : 480 Max. :16.00
## (Other) : 10
## marital.status occupation relationship
## Divorced :2181 Prof-specialty :2077 Husband :6465
## Married-AF-spouse : 13 Craft-repair :2032 Not-in-family :4262
## Married-civ-spouse :7340 Exec-managerial:2009 Other-relative: 525
## Married-spouse-absent: 210 Adm-clerical :1965 Own-child :2511
## Never-married :5425 Sales :1912 Unmarried :1676
## Separated : 503 Other-service :1824 Wife : 757
## Widowed : 524 (Other) :4377
## race sex capital.gain
## Amer-Indian-Eskimo: 159 Female: 5407 Min. : 0.0
## Asian-Pac-Islander: 475 Male :10789 1st Qu.: 0.0
## Black : 1558 Median : 0.0
## Other : 134 Mean : 562.8
## White :13870 3rd Qu.: 0.0
## Max. :41310.0
##
## capital.loss hours.per.week native.country income
## Min. : 0.00 Min. : 1.00 United-States:14813 <=50K.:12435
## 1st Qu.: 0.00 1st Qu.:40.00 Mexico : 310 >50K. : 3761
## Median : 0.00 Median :40.00 Philippines : 109
## Mean : 88.36 Mean :40.33 Puerto-Rico : 70
## 3rd Qu.: 0.00 3rd Qu.:45.00 Germany : 69
## Max. :3770.00 Max. :99.00 Canada : 61
## (Other) : 764
str(newtrain)
## 'data.frame': 32402 obs. of 13 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
## $ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
summary(newtrain)
## age workclass education.num
## Min. :17.00 Private :23984 Min. : 1.00
## 1st Qu.:28.00 Self-emp-not-inc: 2747 1st Qu.: 9.00
## Median :37.00 Local-gov : 2187 Median :10.00
## Mean :38.54 State-gov : 1351 Mean :10.07
## 3rd Qu.:48.00 Self-emp-inc : 1127 3rd Qu.:12.00
## Max. :90.00 Federal-gov : 983 Max. :16.00
## (Other) : 23
## marital.status occupation
## Divorced : 4432 Prof-specialty :4228
## Married-AF-spouse : 23 Craft-repair :4154
## Married-civ-spouse :14844 Exec-managerial:4085
## Married-spouse-absent: 417 Adm-clerical :3986
## Never-married :10671 Other-service :3694
## Separated : 1023 Sales :3690
## Widowed : 992 (Other) :8565
## relationship race sex
## Husband :13072 Amer-Indian-Eskimo: 311 Female:10749
## Not-in-family : 8284 Asian-Pac-Islander: 1029 Male :21653
## Other-relative: 981 Black : 3117
## Own-child : 5066 Other : 269
## Unmarried : 3442 White :27676
## Wife : 1557
##
## capital.gain capital.loss hours.per.week native.country
## Min. : 0.0 Min. : 0.00 Min. : 1.00 United-States:29528
## 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.:40.00 Mexico : 656
## Median : 0.0 Median : 0.00 Median :40.00 Philippines : 210
## Mean : 592.2 Mean : 87.73 Mean :40.39 Germany : 137
## 3rd Qu.: 0.0 3rd Qu.: 0.00 3rd Qu.:45.00 Canada : 120
## Max. :41310.0 Max. :4356.00 Max. :99.00 Puerto-Rico : 114
## (Other) : 1637
## income
## <=50K:24720
## >50K : 7682
##
##
##
##
##
#Analyzing/checking before discretizing
# table(newtrain[,14])
# table(newtest[,14])
#
# plot(newtrain$education)
# plot(newtrain$occupation)
# plot(newtrain$native.country)
#
# plot(newtest$education)
# plot(newtest$occupation)
# plot(newtest$native.country)
#Discretize training set
# discretetrainage <- discretize(newtrain$age, method = "interval", categories = 10)
# discretetrainfnlwgt <- discretize(newtrain$fnlwgt, method = "interval", categories = 10)
# discretetrainedunum <- discretize(newtrain$education.num, method = "interval", categories = 10)
# discretetraingain <- discretize(newtrain$capital.gain, method = "interval", categories = 10)
# discretetrainloss <- discretize(newtrain$capital.loss, method = "interval", categories = 10)
# discretetrainhours <- discretize(newtrain$hours.per.week, method = "interval", categories = 10)
#Binning
#Bin country labels: keep "United-States", "Mexico" and "Philippines" and
#replace every other label with "other_countries".
#  vector : character vector of country labels (callers pass
#           as.character() of the native.country factor).
#  returns: a vector of the same length with the rare labels replaced.
#           NA entries stay NA and a zero-length input is returned unchanged.
countrydis <- function(vector){
  keep <- c("United-States", "Mexico", "Philippines")
  #Vectorised replacement. An element-by-element loop over 1:length(vector)
  #fails on empty input and on NA, because an if() condition cannot be NA.
  vector[!is.na(vector) & !(vector %in% keep)] <- "other_countries"
  return(vector)
}
#Bin workclass labels: keep the six listed classes and replace every other
#label with "No-gain".
#  vector : character vector of workclass labels (callers pass
#           as.character() of the workclass factor).
#  returns: a vector of the same length with the remaining labels replaced.
#           NA entries stay NA and a zero-length input is returned unchanged.
workdis <- function(vector){
  keep <- c("Federal-gov", "Local-gov", "Private", "Self-emp-inc",
            "Self-emp-not-inc", "State-gov")
  #Vectorised replacement. An element-by-element loop over 1:length(vector)
  #fails on empty input and on NA, because an if() condition cannot be NA.
  vector[!is.na(vector) & !(vector %in% keep)] <- "No-gain"
  return(vector)
}
#discretetraincountry <- as.factor(countrydis(as.character(newtrain$native.country)))
#Discretize testing set
# discretetestage <- discretize(newtest$age, method = "interval", categories = 10)
# discretetestfnlwgt <- discretize(newtest$fnlwgt, method = "interval", categories = 10)
# discretetestedunum <- discretize(newtest$education.num, method = "interval", categories = 10)
# discretetestgain <- discretize(newtest$capital.gain, method = "interval", categories = 10)
# discretetestloss <- discretize(newtest$capital.loss, method = "interval", categories = 10)
# discretetesthours <- discretize(newtest$hours.per.week, method = "interval", categories = 10)
# discretetestcountry <- as.factor(countrydis(as.character(newtest$native.country)))
#Combine training and testing to make the same intervals for discretizing.
#Each row is tagged with its origin in a "type" column before stacking, so
#the two sets can be separated again after binning.
newtrain$type <- "train"
newtest$type <- "test"
combined <- rbind(newtrain, newtest)
# discreteage <- discretize(combined$age, method = "interval", categories = 10)
# discretefnlwgt <- discretize(combined$fnlwgt, method = "interval", categories = 10)
# discreteedunum <- discretize(combined$education.num, method = "interval", categories = 10)
# discretegain <- discretize(combined$capital.gain, method = "interval", categories = 7) #not enough data
# discreteloss <- discretize(combined$capital.loss, method = "interval", categories = 7) #not enough data
# discretehours <- discretize(combined$hours.per.week, method = "interval", categories = 10)
#Bin native.country and workclass on the combined data: convert each factor to
#character, collapse the labels, then convert back to a factor.
discretecountry <- combined$native.country %>%
  as.character() %>%
  countrydis() %>%
  as.factor()
discreteworkclass <- combined$workclass %>%
  as.character() %>%
  workdis() %>%
  as.factor()
# combined$age <- discreteage
# combined$fnlwgt <- discretefnlwgt
# combined$education.num <- discreteedunum
# combined$capital.gain <- discretegain
# combined$capital.loss <- discreteloss
# combined$hours.per.week <- discretehours
#Overwrite the original factors with their binned versions
combined[["native.country"]] <- discretecountry
combined[["workclass"]] <- discreteworkclass
dim(combined)
## [1] 48598 14
#Split the combined data back into training and testing sets using the "type"
#tag, and drop that helper column by name. This way the split does not depend
#on the row order of combined or on "type" being the 14th column.
newtrain2 <- combined[combined$type == "train", names(combined) != "type"]
newtest2 <- combined[combined$type == "test", names(combined) != "type"]
dim(newtrain2)
## [1] 32402 13
dim(newtest2)
## [1] 16196 13
#plots
#Plot each of the first 12 columns against column 13, for the training set and
#then for the test set. With mfrow = c(2, 2) the plots are laid out four per
#page.
par(mfrow = c(2, 2)) #set how many plots on the palette.
#Training set: column i on the x axis, column 13 on the y axis.
for(i in 1:12){
plot(newtrain2[, i], newtrain2[, 13])
}
#Test set: same plots.
for(i in 1:12){
plot(newtest2[, i], newtest2[, 13])
}
#Assigning discretized variables
# newtrain2 <- newtrain
# newtest2 <- newtest
# dim(newtrain2)
# dim(newtest2)
#
# newtrain2$age <- discretetrainage
# newtrain2$fnlwgt <- discretetrainfnlwgt
# newtrain2$education.num <- discretetrainedunum
# newtrain2$capital.gain <- discretetraingain
# newtrain2$capital.loss <- discretetrainloss
# newtrain2$hours.per.week <- discretetrainhours
# newtrain2$native.country <- discretetraincountry
#
# newtest2$age <- discretetestage
# newtest2$fnlwgt <- discretetestfnlwgt
# newtest2$education.num <- discretetestedunum
# newtest2$capital.gain <- discretetestgain
# newtest2$capital.loss <- discretetestloss
# newtest2$hours.per.week <- discretetesthours
# newtest2$native.country <- discretetestcountry
#Dummify training set
#Each categorical column is passed to dummy() and the result is stored under
#its own name for the cbind() step further down.
#NOTE(review): dummy() is not defined in this part of the file. The str()
#output at the end of this section shows one 0/1 column per factor level with
#the first level omitted - confirm which package provides it.
dumtrainwork <- dummy(newtrain2$workclass)
dumtrainmarry <- dummy(newtrain2$marital.status)
dumtrainoccu <- dummy(newtrain2$occupation)
dumtrainrelation <- dummy(newtrain2$relationship)
dumtrainrace <- dummy(newtrain2$race)
dumtrainsex <- dummy(newtrain2$sex)
dumtraincountry <- dummy(newtrain2$native.country)
#Dummify testing set
#Same seven columns, encoded separately for the test rows.
dumtestwork <- dummy(newtest2$workclass)
dumtestmarry <- dummy(newtest2$marital.status)
dumtestoccu <- dummy(newtest2$occupation)
dumtestrelation <- dummy(newtest2$relationship)
dumtestrace <- dummy(newtest2$race)
dumtestsex <- dummy(newtest2$sex)
dumtestcountry <- dummy(newtest2$native.country)
#Take out the categorical columns that were dummy-encoded. They are selected
#by name rather than by position (2, 4, 5, 6, 7, 8, 12) so that re-running
#this step cannot remove other columns.
newtrain2 <- newtrain2[, !(names(newtrain2) %in%
                             c("workclass", "marital.status", "occupation",
                               "relationship", "race", "sex",
                               "native.country"))]
newtest2 <- newtest2[, !(names(newtest2) %in%
                           c("workclass", "marital.status", "occupation",
                             "relationship", "race", "sex",
                             "native.country"))]
#Assigning dummified variables
newtrain2 <- cbind(newtrain2, dumtrainwork, dumtrainmarry, dumtrainoccu,
                   dumtrainrelation, dumtrainrace, dumtrainsex,
                   dumtraincountry)
#Move the response column to the end. It is located by name instead of the
#hard-coded positions 6, 44 and 45, which are only right when the dummy
#matrices contribute exactly 38 columns.
newtrain2 <- newtrain2[, c(which(names(newtrain2) != "income"),
                           which(names(newtrain2) == "income"))]
dim(newtrain2)
## [1] 32402 44
#Attach the dummy columns to the test set
newtest2 <- cbind(newtest2, dumtestwork, dumtestmarry, dumtestoccu,
                  dumtestrelation, dumtestrace, dumtestsex, dumtestcountry)
#Move the response column to the end. It is located by name instead of the
#hard-coded positions 6, 44 and 45, which are only right when the dummy
#matrices contribute exactly 38 columns.
newtest2 <- newtest2[, c(which(names(newtest2) != "income"),
                         which(names(newtest2) == "income"))]
dim(newtest2)
## [1] 16196 44
#fixing...
#The test file labels income as "<=50K." / ">50K." while the training file
#uses "<=50K" / ">50K", so after rbind() the factor carries all four levels.
#Training rows only need their unused (dotted) levels dropped.
newtrain2$income <- droplevels(newtrain2$income)
#Test rows: remove a trailing period only when one is present. Unconditionally
#cutting the last character would corrupt the labels if this step is re-run
#or if the labels are already clean.
newtest2$income <- factor(sub("\\.$", "", as.character(newtest2$income)))
dim(newtrain2)
## [1] 32402 44
dim(newtest2)
## [1] 16196 44
str(newtrain2)
## 'data.frame': 32402 obs. of 44 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ Local-gov : num 0 0 0 0 0 0 0 0 0 0 ...
## $ No-gain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Private : num 0 0 1 1 1 1 1 0 1 1 ...
## $ Self-emp-inc : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Self-emp-not-inc : num 0 1 0 0 0 0 0 1 0 0 ...
## $ State-gov : num 1 0 0 0 0 0 0 0 0 0 ...
## $ Married-AF-spouse : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Married-civ-spouse : num 0 1 0 1 1 1 0 1 0 1 ...
## $ Married-spouse-absent: num 0 0 0 0 0 0 1 0 0 0 ...
## $ Never-married : num 1 0 0 0 0 0 0 0 1 0 ...
## $ Separated : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Widowed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Armed-Forces : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Craft-repair : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Exec-managerial : num 0 1 0 0 0 1 0 1 0 1 ...
## $ Farming-fishing : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Handlers-cleaners : num 0 0 1 1 0 0 0 0 0 0 ...
## $ Machine-op-inspct : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Other-service : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Priv-house-serv : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Prof-specialty : num 0 0 0 0 1 0 0 0 1 0 ...
## $ Protective-serv : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sales : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Tech-support : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Transport-moving : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Not-in-family : num 1 0 1 0 0 0 1 0 1 0 ...
## $ Other-relative : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Own-child : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Unmarried : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Wife : num 0 0 0 0 1 1 0 0 0 0 ...
## $ Asian-Pac-Islander : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Black : num 0 0 0 1 1 0 1 0 0 0 ...
## $ Other : num 0 0 0 0 0 0 0 0 0 0 ...
## $ White : num 1 1 1 0 0 1 0 1 1 1 ...
## $ Male : num 1 1 1 1 0 0 0 1 0 1 ...
## $ other_countries : num 0 0 0 0 1 0 1 0 0 0 ...
## $ Philippines : num 0 0 0 0 0 0 0 0 0 0 ...
## $ United-States : num 1 1 1 1 0 1 0 1 1 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
str(newtest2)
## 'data.frame': 16196 obs. of 44 variables:
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ education.num : int 7 9 12 10 10 6 9 15 10 4 ...
## $ capital.gain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week : int 40 50 40 40 30 30 40 32 40 10 ...
## $ Local-gov : num 0 0 1 0 0 0 0 0 0 0 ...
## $ No-gain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Private : num 1 1 0 1 1 1 1 0 1 1 ...
## $ Self-emp-inc : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Self-emp-not-inc : num 0 0 0 0 0 0 0 1 0 0 ...
## $ State-gov : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Married-AF-spouse : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Married-civ-spouse : num 0 1 1 1 0 0 0 1 0 1 ...
## $ Married-spouse-absent: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Never-married : num 1 0 0 0 1 1 1 0 1 0 ...
## $ Separated : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Widowed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Armed-Forces : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Craft-repair : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Exec-managerial : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Farming-fishing : num 0 1 0 0 0 0 0 0 0 0 ...
## $ Handlers-cleaners : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Machine-op-inspct : num 1 0 0 1 0 0 0 0 0 0 ...
## $ Other-service : num 0 0 0 0 0 1 0 0 1 0 ...
## $ Priv-house-serv : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Prof-specialty : num 0 0 0 0 0 0 0 1 0 0 ...
## $ Protective-serv : num 0 0 1 0 0 0 0 0 0 0 ...
## $ Sales : num 0 0 0 0 1 0 0 0 0 0 ...
## $ Tech-support : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Transport-moving : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Not-in-family : num 0 0 0 0 0 1 0 0 0 0 ...
## $ Other-relative : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Own-child : num 1 0 0 0 1 0 0 0 0 0 ...
## $ Unmarried : num 0 0 0 0 0 0 1 0 1 0 ...
## $ Wife : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Asian-Pac-Islander : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Black : num 1 0 0 1 0 0 1 0 0 0 ...
## $ Other : num 0 0 0 0 0 0 0 0 0 0 ...
## $ White : num 0 1 1 0 1 1 0 1 1 1 ...
## $ Male : num 1 1 1 1 0 1 1 1 0 1 ...
## $ other_countries : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Philippines : num 0 0 0 0 0 0 0 0 0 0 ...
## $ United-States : num 1 1 1 1 1 1 1 1 1 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...